This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
silence warning
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
be3c0a43 70 **** Copyright (c) 1991-2002, Larry Wall
a687059c 71 ****
9ef589d8
LW
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
a687059c
LW
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
864dbfa3 80#define PERL_IN_REGEXEC_C
a687059c 81#include "perl.h"
0f5d15d6 82
a687059c
LW
83#include "regcomp.h"
84
c277df42
IZ
85#define RF_tainted 1 /* tainted information used? */
86#define RF_warned 2 /* warned about big count? */
ce862d02 87#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
88#define RF_utf8 8 /* String contains multibyte chars? */
89
eb160463 90#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
91
92#define RS_init 1 /* eval environment created */
93#define RS_set 2 /* replsv value is set */
c277df42 94
a687059c
LW
95#ifndef STATIC
96#define STATIC static
97#endif
98
ba7b4546 99#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 100
c277df42
IZ
101/*
102 * Forwards.
103 */
104
33b8afdf 105#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 106#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 107
dfe13c55
GS
108#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
109#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
110#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
111#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
112#define HOPc(pos,off) ((char*)HOP(pos,off))
113#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 114
efb30f32 115#define HOPBACK(pos, off) ( \
e54858b0 116 (PL_reg_match_utf8) \
efb30f32
HS
117 ? reghopmaybe((U8*)pos, -off) \
118 : (pos - off >= PL_bostr) \
119 ? (U8*)(pos - off) \
120 : (U8*)NULL \
121)
122#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
123
1aa99e6b
IH
124#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
125#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
127#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
129#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
130
977e4b84 131#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 132
5f80c4cf 133/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
134#define JUMPABLE(rn) ( \
135 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
136 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
137 OP(rn) == PLUS || OP(rn) == MINMOD || \
138 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
139)
140
cca55fe3
JP
141#define HAS_TEXT(rn) ( \
142 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
143)
e2d8ce26 144
a84d97b6
HS
145/*
146 Search for mandatory following text node; for lookahead, the text must
147 follow but for lookbehind (rn->flags != 0) we skip to the next step.
148*/
cca55fe3 149#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 150 while (JUMPABLE(rn)) \
a84d97b6 151 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 152 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
153 else if (OP(rn) == PLUS) \
154 rn = NEXTOPER(rn); \
a84d97b6
HS
155 else if (OP(rn) == IFMATCH) \
156 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 157 else rn += NEXT_OFF(rn); \
5f80c4cf 158} STMT_END
74750237 159
acfe0abc 160static void restore_pos(pTHX_ void *arg);
51371543 161
76e3520e 162STATIC CHECKPOINT
cea2e8a9 163S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 164{
3280af22 165 int retval = PL_savestack_ix;
b1ce53c5
JH
166#define REGCP_PAREN_ELEMS 4
167 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
168 int p;
169
e49a9654
IH
170 if (paren_elems_to_push < 0)
171 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
172
a01268b5 173#define REGCP_OTHER_ELEMS 6
b1ce53c5 174 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 175 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 176/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
177 SSPUSHINT(PL_regendp[p]);
178 SSPUSHINT(PL_regstartp[p]);
3280af22 179 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
180 SSPUSHINT(p);
181 }
b1ce53c5 182/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
183 SSPUSHINT(PL_regsize);
184 SSPUSHINT(*PL_reglastparen);
a01268b5 185 SSPUSHINT(*PL_reglastcloseparen);
3280af22 186 SSPUSHPTR(PL_reginput);
41123dfd
JH
187#define REGCP_FRAME_ELEMS 2
188/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
189 * are needed for the regexp context stack bookkeeping. */
190 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 191 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 192
a0d0e21e
LW
193 return retval;
194}
195
c277df42 196/* These are needed since we do not localize EVAL nodes: */
02db2b7b 197# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 199 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 200
02db2b7b 201# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 202 PerlIO_printf(Perl_debug_log, \
faccc32b 203 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 204 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 205
76e3520e 206STATIC char *
cea2e8a9 207S_regcppop(pTHX)
a0d0e21e 208{
b1ce53c5 209 I32 i;
a0d0e21e
LW
210 U32 paren = 0;
211 char *input;
cf93c79d 212 I32 tmps;
b1ce53c5
JH
213
214 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 215 i = SSPOPINT;
b1ce53c5
JH
216 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
217 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 218 input = (char *) SSPOPPTR;
a01268b5 219 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
220 *PL_reglastparen = SSPOPINT;
221 PL_regsize = SSPOPINT;
b1ce53c5
JH
222
223 /* Now restore the parentheses context. */
41123dfd
JH
224 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
225 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 226 paren = (U32)SSPOPINT;
3280af22 227 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
228 PL_regstartp[paren] = SSPOPINT;
229 tmps = SSPOPINT;
3280af22
NIS
230 if (paren <= *PL_reglastparen)
231 PL_regendp[paren] = tmps;
c277df42 232 DEBUG_r(
c3464db5 233 PerlIO_printf(Perl_debug_log,
b900a521 234 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 235 (UV)paren, (IV)PL_regstartp[paren],
b900a521 236 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 237 (IV)PL_regendp[paren],
3280af22 238 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 239 );
a0d0e21e 240 }
c277df42 241 DEBUG_r(
eb160463 242 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 243 PerlIO_printf(Perl_debug_log,
faccc32b
JH
244 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
245 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
246 }
247 );
daf18116 248#if 1
dafc8851
JH
249 /* It would seem that the similar code in regtry()
250 * already takes care of this, and in fact it is in
251 * a better location to since this code can #if 0-ed out
252 * but the code in regtry() is needed or otherwise tests
253 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
254 * (as of patchlevel 7877) will fail. Then again,
255 * this code seems to be necessary or otherwise
256 * building DynaLoader will fail:
257 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
258 * --jhi */
eb160463
GS
259 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
260 if ((I32)paren > PL_regsize)
cf93c79d
IZ
261 PL_regstartp[paren] = -1;
262 PL_regendp[paren] = -1;
a0d0e21e 263 }
dafc8851 264#endif
a0d0e21e
LW
265 return input;
266}
267
0f5d15d6 268STATIC char *
cea2e8a9 269S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
270{
271 I32 tmp = PL_savestack_ix;
272
273 PL_savestack_ix = ss;
274 regcppop();
275 PL_savestack_ix = tmp;
942e002e 276 return Nullch;
0f5d15d6
IZ
277}
278
279typedef struct re_cc_state
280{
281 I32 ss;
282 regnode *node;
283 struct re_cc_state *prev;
284 CURCUR *cc;
285 regexp *re;
286} re_cc_state;
287
02db2b7b 288#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 289
29d1e993
HS
290#define TRYPAREN(paren, n, input) { \
291 if (paren) { \
292 if (n) { \
293 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
294 PL_regendp[paren] = input - PL_bostr; \
295 } \
296 else \
297 PL_regendp[paren] = -1; \
298 } \
299 if (regmatch(next)) \
300 sayYES; \
301 if (paren && n) \
302 PL_regendp[paren] = -1; \
303}
304
305
a687059c 306/*
e50aee73 307 * pregexec and friends
a687059c
LW
308 */
309
310/*
c277df42 311 - pregexec - match a regexp against a string
a687059c 312 */
c277df42 313I32
864dbfa3 314Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 315 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
316/* strend: pointer to null at end of string */
317/* strbeg: real beginning of string */
318/* minend: end of match must be >=minend after stringarg. */
319/* nosave: For optimizations. */
320{
321 return
9041c2e3 322 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
323 nosave ? 0 : REXEC_COPY_STR);
324}
0f5d15d6
IZ
325
326STATIC void
cea2e8a9 327S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
328{
329 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
330#ifdef DEBUGGING
331 PL_regprogram = prog->program;
332#endif
333 PL_regnpar = prog->nparens;
9041c2e3
NIS
334 PL_regdata = prog->data;
335 PL_reg_re = prog;
0f5d15d6 336}
22e551b9 337
9041c2e3 338/*
cad2e5aa
JH
339 * Need to implement the following flags for reg_anch:
340 *
341 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
342 * USE_INTUIT_ML
343 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
344 * INTUIT_AUTORITATIVE_ML
345 * INTUIT_ONCE_NOML - Intuit can match in one location only.
346 * INTUIT_ONCE_ML
347 *
348 * Another flag for this function: SECOND_TIME (so that float substrs
349 * with giant delta may be not rechecked).
350 */
351
352/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
353
2c2d71f5 354/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
355 Otherwise, only SvCUR(sv) is used to get strbeg. */
356
357/* XXXX We assume that strpos is strbeg unless sv. */
358
6eb5f6b9
JH
359/* XXXX Some places assume that there is a fixed substring.
360 An update may be needed if optimizer marks as "INTUITable"
361 RExen without fixed substrings. Similarly, it is assumed that
362 lengths of all the strings are no more than minlen, thus they
363 cannot come from lookahead.
364 (Or minlen should take into account lookahead.) */
365
2c2d71f5
JH
366/* A failure to find a constant substring means that there is no need to make
367 an expensive call to REx engine, thus we celebrate a failure. Similarly,
368 finding a substring too deep into the string means that less calls to
30944b6d
IZ
369 regtry() should be needed.
370
371 REx compiler's optimizer found 4 possible hints:
372 a) Anchored substring;
373 b) Fixed substring;
374 c) Whether we are anchored (beginning-of-line or \G);
375 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 376 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
377 string which does not contradict any of them.
378 */
2c2d71f5 379
6eb5f6b9
JH
380/* Most of decisions we do here should have been done at compile time.
381 The nodes of the REx which we used for the search should have been
382 deleted from the finite automaton. */
383
cad2e5aa
JH
384char *
385Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
386 char *strend, U32 flags, re_scream_pos_data *data)
387{
b7953727 388 register I32 start_shift = 0;
cad2e5aa 389 /* Should be nonnegative! */
b7953727 390 register I32 end_shift = 0;
2c2d71f5
JH
391 register char *s;
392 register SV *check;
a1933d95 393 char *strbeg;
cad2e5aa 394 char *t;
33b8afdf 395 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 396 I32 ml_anch;
6eb5f6b9 397 register char *other_last = Nullch; /* other substr checked before this */
b7953727 398 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
399#ifdef DEBUGGING
400 char *i_strpos = strpos;
ce333219 401 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 402#endif
cad2e5aa 403
b8d68ded
JH
404 if (prog->reganch & ROPT_UTF8) {
405 DEBUG_r(PerlIO_printf(Perl_debug_log,
406 "UTF-8 regex...\n"));
407 PL_reg_flags |= RF_utf8;
408 }
409
2a782b5b 410 DEBUG_r({
b8d68ded 411 char *s = PL_reg_match_utf8 ?
c728cb41
JH
412 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
413 strpos;
b8d68ded
JH
414 int len = PL_reg_match_utf8 ?
415 strlen(s) : strend - strpos;
2a782b5b
JH
416 if (!PL_colorset)
417 reginitcolors();
b8d68ded
JH
418 if (PL_reg_match_utf8)
419 DEBUG_r(PerlIO_printf(Perl_debug_log,
420 "UTF-8 target...\n"));
2a782b5b
JH
421 PerlIO_printf(Perl_debug_log,
422 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
423 PL_colors[4],PL_colors[5],PL_colors[0],
424 prog->precomp,
425 PL_colors[1],
426 (strlen(prog->precomp) > 60 ? "..." : ""),
427 PL_colors[0],
428 (int)(len > 60 ? 60 : len),
429 s, PL_colors[1],
430 (len > 60 ? "..." : "")
431 );
432 });
cad2e5aa 433
c344f387
JH
434 /* CHR_DIST() would be more correct here but it makes things slow. */
435 if (prog->minlen > strend - strpos) {
a72c7584
JH
436 DEBUG_r(PerlIO_printf(Perl_debug_log,
437 "String too short... [re_intuit_start]\n"));
cad2e5aa 438 goto fail;
2c2d71f5 439 }
a1933d95 440 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 441 PL_regeol = strend;
33b8afdf
JH
442 if (do_utf8) {
443 if (!prog->check_utf8 && prog->check_substr)
444 to_utf8_substr(prog);
445 check = prog->check_utf8;
446 } else {
447 if (!prog->check_substr && prog->check_utf8)
448 to_byte_substr(prog);
449 check = prog->check_substr;
450 }
451 if (check == &PL_sv_undef) {
452 DEBUG_r(PerlIO_printf(Perl_debug_log,
453 "Non-utf string cannot match utf check string\n"));
454 goto fail;
455 }
2c2d71f5 456 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
457 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
458 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 459 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 460
7e25d62c
JH
461 if (!ml_anch) {
462 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
463 | ROPT_IMPLICIT)) /* not a real BOL */
464 /* SvCUR is not set on references: SvRV and SvPVX overlap */
465 && sv && !SvROK(sv)
466 && (strpos != strbeg)) {
467 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
468 goto fail;
469 }
470 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 471 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 472 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
473 I32 slen;
474
1aa99e6b 475 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
476 if (SvTAIL(check)) {
477 slen = SvCUR(check); /* >= 1 */
cad2e5aa 478
9041c2e3 479 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
480 || (strend - s == slen && strend[-1] != '\n')) {
481 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
482 goto fail_finish;
cad2e5aa
JH
483 }
484 /* Now should match s[0..slen-2] */
485 slen--;
653099ff 486 if (slen && (*SvPVX(check) != *s
cad2e5aa 487 || (slen > 1
653099ff 488 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
489 report_neq:
490 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
491 goto fail_finish;
492 }
cad2e5aa 493 }
653099ff
GS
494 else if (*SvPVX(check) != *s
495 || ((slen = SvCUR(check)) > 1
496 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
497 goto report_neq;
498 goto success_at_start;
7e25d62c 499 }
cad2e5aa 500 }
2c2d71f5 501 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 502 s = strpos;
2c2d71f5 503 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 504 end_shift = prog->minlen - start_shift -
653099ff 505 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 506 if (!ml_anch) {
653099ff
GS
507 I32 end = prog->check_offset_max + CHR_SVLEN(check)
508 - (SvTAIL(check) != 0);
1aa99e6b 509 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
510
511 if (end_shift < eshift)
512 end_shift = eshift;
513 }
cad2e5aa 514 }
2c2d71f5 515 else { /* Can match at random position */
cad2e5aa
JH
516 ml_anch = 0;
517 s = strpos;
2c2d71f5
JH
518 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
519 /* Should be nonnegative! */
520 end_shift = prog->minlen - start_shift -
653099ff 521 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
522 }
523
2c2d71f5 524#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 525 if (end_shift < 0)
6bbae5e6 526 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
527#endif
528
2c2d71f5
JH
529 restart:
530 /* Find a possible match in the region s..strend by looking for
531 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 532 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
533 I32 p = -1; /* Internal iterator of scream. */
534 I32 *pp = data ? data->scream_pos : &p;
535
2c2d71f5
JH
536 if (PL_screamfirst[BmRARE(check)] >= 0
537 || ( BmRARE(check) == '\n'
538 && (BmPREVIOUS(check) == SvCUR(check) - 1)
539 && SvTAIL(check) ))
9041c2e3 540 s = screaminstr(sv, check,
2c2d71f5 541 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 542 else
2c2d71f5 543 goto fail_finish;
cad2e5aa
JH
544 if (data)
545 *data->scream_olds = s;
546 }
f33976b4 547 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
548 s = fbm_instr((U8*)(s + start_shift),
549 (U8*)(strend - end_shift),
550 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 551 else
1aa99e6b
IH
552 s = fbm_instr(HOP3(s, start_shift, strend),
553 HOP3(strend, -end_shift, strbeg),
2c2d71f5 554 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
555
556 /* Update the count-of-usability, remove useless subpatterns,
557 unshift s. */
2c2d71f5
JH
558
559 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
560 (s ? "Found" : "Did not find"),
33b8afdf 561 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 562 PL_colors[0],
7b0972df
JH
563 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
564 SvPVX(check),
2c2d71f5
JH
565 PL_colors[1], (SvTAIL(check) ? "$" : ""),
566 (s ? " at offset " : "...\n") ) );
567
568 if (!s)
569 goto fail_finish;
570
6eb5f6b9
JH
571 check_at = s;
572
2c2d71f5 573 /* Finish the diagnostic message */
30944b6d 574 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
575
576 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
577 Start with the other substr.
578 XXXX no SCREAM optimization yet - and a very coarse implementation
579 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
580 *always* match. Probably should be marked during compile...
581 Probably it is right to do no SCREAM here...
582 */
583
33b8afdf 584 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 585 /* Take into account the "other" substring. */
2c2d71f5
JH
586 /* XXXX May be hopelessly wrong for UTF... */
587 if (!other_last)
6eb5f6b9 588 other_last = strpos;
33b8afdf 589 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
590 do_other_anchored:
591 {
1aa99e6b 592 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 593 char *s1 = s;
33b8afdf 594 SV* must;
2c2d71f5 595
2c2d71f5
JH
596 t = s - prog->check_offset_max;
597 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 598 && (!do_utf8
1aa99e6b 599 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 600 && t > strpos)))
30944b6d 601 /* EMPTY */;
2c2d71f5
JH
602 else
603 t = strpos;
1aa99e6b 604 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
605 if (t < other_last) /* These positions already checked */
606 t = other_last;
1aa99e6b 607 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
608 if (last < last1)
609 last1 = last;
610 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
611 /* On end-of-str: see comment below. */
33b8afdf
JH
612 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
613 if (must == &PL_sv_undef) {
614 s = (char*)NULL;
615 DEBUG_r(must = prog->anchored_utf8); /* for debug */
616 }
617 else
618 s = fbm_instr(
619 (unsigned char*)t,
620 HOP3(HOP3(last1, prog->anchored_offset, strend)
621 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
622 must,
623 PL_multiline ? FBMrf_MULTILINE : 0
624 );
1aa99e6b
IH
625 DEBUG_r(PerlIO_printf(Perl_debug_log,
626 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
627 (s ? "Found" : "Contradicts"),
628 PL_colors[0],
33b8afdf
JH
629 (int)(SvCUR(must)
630 - (SvTAIL(must)!=0)),
631 SvPVX(must),
632 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
633 if (!s) {
634 if (last1 >= last2) {
635 DEBUG_r(PerlIO_printf(Perl_debug_log,
636 ", giving up...\n"));
637 goto fail_finish;
638 }
639 DEBUG_r(PerlIO_printf(Perl_debug_log,
640 ", trying floating at offset %ld...\n",
1aa99e6b
IH
641 (long)(HOP3c(s1, 1, strend) - i_strpos)));
642 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
643 s = HOP3c(last, 1, strend);
2c2d71f5
JH
644 goto restart;
645 }
646 else {
647 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 648 (long)(s - i_strpos)));
1aa99e6b
IH
649 t = HOP3c(s, -prog->anchored_offset, strbeg);
650 other_last = HOP3c(s, 1, strend);
30944b6d 651 s = s1;
2c2d71f5
JH
652 if (t == strpos)
653 goto try_at_start;
2c2d71f5
JH
654 goto try_at_offset;
655 }
30944b6d 656 }
2c2d71f5
JH
657 }
658 else { /* Take into account the floating substring. */
33b8afdf
JH
659 char *last, *last1;
660 char *s1 = s;
661 SV* must;
662
663 t = HOP3c(s, -start_shift, strbeg);
664 last1 = last =
665 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
666 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
667 last = HOP3c(t, prog->float_max_offset, strend);
668 s = HOP3c(t, prog->float_min_offset, strend);
669 if (s < other_last)
670 s = other_last;
2c2d71f5 671 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
672 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
673 /* fbm_instr() takes into account exact value of end-of-str
674 if the check is SvTAIL(ed). Since false positives are OK,
675 and end-of-str is not later than strend we are OK. */
676 if (must == &PL_sv_undef) {
677 s = (char*)NULL;
678 DEBUG_r(must = prog->float_utf8); /* for debug message */
679 }
680 else
2c2d71f5 681 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
682 (unsigned char*)last + SvCUR(must)
683 - (SvTAIL(must)!=0),
684 must, PL_multiline ? FBMrf_MULTILINE : 0);
685 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
686 (s ? "Found" : "Contradicts"),
687 PL_colors[0],
688 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
689 SvPVX(must),
690 PL_colors[1], (SvTAIL(must) ? "$" : "")));
691 if (!s) {
692 if (last1 == last) {
2c2d71f5 693 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
694 ", giving up...\n"));
695 goto fail_finish;
2c2d71f5 696 }
33b8afdf
JH
697 DEBUG_r(PerlIO_printf(Perl_debug_log,
698 ", trying anchored starting at offset %ld...\n",
699 (long)(s1 + 1 - i_strpos)));
700 other_last = last;
701 s = HOP3c(t, 1, strend);
702 goto restart;
703 }
704 else {
705 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
706 (long)(s - i_strpos)));
707 other_last = s; /* Fix this later. --Hugo */
708 s = s1;
709 if (t == strpos)
710 goto try_at_start;
711 goto try_at_offset;
712 }
2c2d71f5 713 }
cad2e5aa 714 }
2c2d71f5
JH
715
716 t = s - prog->check_offset_max;
2c2d71f5 717 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 718 && (!do_utf8
1aa99e6b
IH
719 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
720 && t > strpos))) {
2c2d71f5
JH
721 /* Fixed substring is found far enough so that the match
722 cannot start at strpos. */
723 try_at_offset:
cad2e5aa 724 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
725 /* Eventually fbm_*() should handle this, but often
726 anchored_offset is not 0, so this check will not be wasted. */
727 /* XXXX In the code below we prefer to look for "^" even in
728 presence of anchored substrings. And we search even
729 beyond the found float position. These pessimizations
730 are historical artefacts only. */
731 find_anchor:
2c2d71f5 732 while (t < strend - prog->minlen) {
cad2e5aa 733 if (*t == '\n') {
4ee3650e 734 if (t < check_at - prog->check_offset_min) {
33b8afdf 735 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
736 /* Since we moved from the found position,
737 we definitely contradict the found anchored
30944b6d
IZ
738 substr. Due to the above check we do not
739 contradict "check" substr.
740 Thus we can arrive here only if check substr
741 is float. Redo checking for "other"=="fixed".
742 */
9041c2e3 743 strpos = t + 1;
30944b6d
IZ
744 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
745 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
746 goto do_other_anchored;
747 }
4ee3650e
GS
748 /* We don't contradict the found floating substring. */
749 /* XXXX Why not check for STCLASS? */
cad2e5aa 750 s = t + 1;
2c2d71f5 751 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 752 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
753 goto set_useful;
754 }
4ee3650e
GS
755 /* Position contradicts check-string */
756 /* XXXX probably better to look for check-string
757 than for "\n", so one should lower the limit for t? */
758 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 759 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 760 other_last = strpos = s = t + 1;
cad2e5aa
JH
761 goto restart;
762 }
763 t++;
764 }
2c2d71f5
JH
765 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
766 PL_colors[0],PL_colors[1]));
767 goto fail_finish;
cad2e5aa 768 }
f5952150
GS
769 else {
770 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
771 PL_colors[0],PL_colors[1]));
772 }
cad2e5aa
JH
773 s = t;
774 set_useful:
33b8afdf 775 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
776 }
777 else {
f5952150 778 /* The found string does not prohibit matching at strpos,
2c2d71f5 779 - no optimization of calling REx engine can be performed,
f5952150
GS
780 unless it was an MBOL and we are not after MBOL,
781 or a future STCLASS check will fail this. */
2c2d71f5
JH
782 try_at_start:
783 /* Even in this situation we may use MBOL flag if strpos is offset
784 wrt the start of the string. */
05b4157f 785 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 786 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
787 /* May be due to an implicit anchor of m{.*foo} */
788 && !(prog->reganch & ROPT_IMPLICIT))
789 {
cad2e5aa
JH
790 t = strpos;
791 goto find_anchor;
792 }
30944b6d 793 DEBUG_r( if (ml_anch)
f5952150
GS
794 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
795 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 796 );
2c2d71f5 797 success_at_start:
30944b6d 798 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
799 && (do_utf8 ? (
800 prog->check_utf8 /* Could be deleted already */
801 && --BmUSEFUL(prog->check_utf8) < 0
802 && (prog->check_utf8 == prog->float_utf8)
803 ) : (
804 prog->check_substr /* Could be deleted already */
805 && --BmUSEFUL(prog->check_substr) < 0
806 && (prog->check_substr == prog->float_substr)
807 )))
66e933ab 808 {
cad2e5aa 809 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 810 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
811 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
812 if (do_utf8 ? prog->check_substr : prog->check_utf8)
813 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
814 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
815 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 816 check = Nullsv; /* abort */
cad2e5aa 817 s = strpos;
3cf5c195
IZ
818 /* XXXX This is a remnant of the old implementation. It
819 looks wasteful, since now INTUIT can use many
6eb5f6b9 820 other heuristics. */
cad2e5aa
JH
821 prog->reganch &= ~RE_USE_INTUIT;
822 }
823 else
824 s = strpos;
825 }
826
6eb5f6b9
JH
827 /* Last resort... */
828 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
829 if (prog->regstclass) {
830 /* minlen == 0 is possible if regstclass is \b or \B,
831 and the fixed substr is ''$.
832 Since minlen is already taken into account, s+1 is before strend;
833 accidentally, minlen >= 1 guaranties no false positives at s + 1
834 even for \b or \B. But (minlen? 1 : 0) below assumes that
835 regstclass does not come from lookahead... */
836 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
837 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 838 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 839 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 840 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 841 : 1);
33b8afdf 842 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 843 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 844 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
845 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
846 cl_l, strend)
847 : strend);
a1933d95 848 char *startpos = strbeg;
6eb5f6b9
JH
849
850 t = s;
76384e4a 851 if (prog->reganch & ROPT_UTF8) {
ffc61ed2 852 PL_regdata = prog->data;
76384e4a
GS
853 PL_bostr = startpos;
854 }
f33976b4 855 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
856 if (!s) {
857#ifdef DEBUGGING
b7953727 858 char *what = 0;
6eb5f6b9
JH
859#endif
860 if (endpos == strend) {
861 DEBUG_r( PerlIO_printf(Perl_debug_log,
862 "Could not match STCLASS...\n") );
863 goto fail;
864 }
66e933ab
GS
865 DEBUG_r( PerlIO_printf(Perl_debug_log,
866 "This position contradicts STCLASS...\n") );
653099ff
GS
867 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
868 goto fail;
6eb5f6b9 869 /* Contradict one of substrings */
33b8afdf
JH
870 if (prog->anchored_substr || prog->anchored_utf8) {
871 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
872 DEBUG_r( what = "anchored" );
873 hop_and_restart:
1aa99e6b 874 s = HOP3c(t, 1, strend);
66e933ab
GS
875 if (s + start_shift + end_shift > strend) {
876 /* XXXX Should be taken into account earlier? */
877 DEBUG_r( PerlIO_printf(Perl_debug_log,
878 "Could not match STCLASS...\n") );
879 goto fail;
880 }
5e39e1e5
HS
881 if (!check)
882 goto giveup;
6eb5f6b9 883 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 884 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
885 what, (long)(s + start_shift - i_strpos)) );
886 goto restart;
887 }
66e933ab 888 /* Have both, check_string is floating */
6eb5f6b9
JH
889 if (t + start_shift >= check_at) /* Contradicts floating=check */
890 goto retry_floating_check;
891 /* Recheck anchored substring, but not floating... */
9041c2e3 892 s = check_at;
5e39e1e5
HS
893 if (!check)
894 goto giveup;
6eb5f6b9 895 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 896 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
897 (long)(other_last - i_strpos)) );
898 goto do_other_anchored;
899 }
60e71179
GS
900 /* Another way we could have checked stclass at the
901 current position only: */
902 if (ml_anch) {
903 s = t = t + 1;
5e39e1e5
HS
904 if (!check)
905 goto giveup;
60e71179 906 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
907 "Looking for /%s^%s/m starting at offset %ld...\n",
908 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 909 goto try_at_offset;
66e933ab 910 }
33b8afdf 911 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 912 goto fail;
6eb5f6b9
JH
913 /* Check is floating subtring. */
914 retry_floating_check:
915 t = check_at - start_shift;
916 DEBUG_r( what = "floating" );
917 goto hop_and_restart;
918 }
b7953727
JH
919 if (t != s) {
920 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 921 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
922 (long)(t - i_strpos), (long)(s - i_strpos))
923 );
924 }
925 else {
926 DEBUG_r(PerlIO_printf(Perl_debug_log,
927 "Does not contradict STCLASS...\n");
928 );
929 }
6eb5f6b9 930 }
5e39e1e5
HS
931 giveup:
932 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
933 PL_colors[4], (check ? "Guessed" : "Giving up"),
934 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 935 return s;
2c2d71f5
JH
936
937 fail_finish: /* Substring not found */
33b8afdf
JH
938 if (prog->check_substr || prog->check_utf8) /* could be removed already */
939 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 940 fail:
2c2d71f5 941 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
942 PL_colors[4],PL_colors[5]));
943 return Nullch;
944}
9661b544 945
6eb5f6b9 946/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
947STATIC char *
948S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 949{
6eb5f6b9
JH
950 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
951 char *m;
d8093b23
G
952 STRLEN ln;
953 unsigned int c1;
954 unsigned int c2;
6eb5f6b9
JH
955 char *e;
956 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 957 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 958
6eb5f6b9
JH
959 /* We know what class it must start with. */
960 switch (OP(c)) {
6eb5f6b9 961 case ANYOF:
a0ed51b3 962 while (s < strend) {
7d3e948e
JH
963 STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
964
965 if (do_utf8 ?
ba7b4546 966 reginclass(c, (U8*)s, 0, do_utf8) :
7d3e948e 967 REGINCLASS(c, (U8*)s) ||
ebc501f0
JH
968 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
969 /* The assignment of 2 is intentional:
970 * for the sharp s, the skip is 2. */
971 (skip = SHARP_S_SKIP)
972 )) {
6eb5f6b9
JH
973 if (tmp && (norun || regtry(prog, s)))
974 goto got_it;
975 else
976 tmp = doevery;
a0ed51b3 977 }
e0f9d4a8 978 else
7d3e948e 979 tmp = 1;
e0f9d4a8 980 s += skip;
a0d0e21e 981 }
6eb5f6b9 982 break;
f33976b4
DB
983 case CANY:
984 while (s < strend) {
985 if (tmp && (norun || regtry(prog, s)))
986 goto got_it;
987 else
988 tmp = doevery;
989 s++;
990 }
991 break;
6eb5f6b9
JH
992 case EXACTF:
993 m = STRING(c);
994 ln = STR_LEN(c);
1aa99e6b 995 if (UTF) {
a2a2844f 996 STRLEN ulen1, ulen2;
e7ae6809
JH
997 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
998 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
999
1000 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1001 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1002
872c91ae
JH
1003 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1004 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1005 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1006 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1007 }
1008 else {
1009 c1 = *(U8*)m;
1010 c2 = PL_fold[c1];
1011 }
6eb5f6b9
JH
1012 goto do_exactf;
1013 case EXACTFL:
1014 m = STRING(c);
1015 ln = STR_LEN(c);
d8093b23 1016 c1 = *(U8*)m;
6eb5f6b9
JH
1017 c2 = PL_fold_locale[c1];
1018 do_exactf:
6795a346 1019 e = HOP3c(strend, -(I32)ln, s);
b3c9acc1 1020
6eb5f6b9
JH
1021 if (norun && e < s)
1022 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1023
60a8b682
JH
1024 /* The idea in the EXACTF* cases is to first find the
1025 * first character of the EXACTF* node and then, if
1026 * necessary, case-insensitively compare the full
1027 * text of the node. The c1 and c2 are the first
1028 * characters (though in Unicode it gets a bit
1029 * more complicated because there are more cases
7f16dd3d
JH
1030 * than just upper and lower: one needs to use
1031 * the so-called folding case for case-insensitive
1032 * matching (called "loose matching" in Unicode).
1033 * ibcmp_utf8() will do just that. */
60a8b682 1034
1aa99e6b 1035 if (do_utf8) {
575cac57
JH
1036 UV c, f;
1037 U8 tmpbuf [UTF8_MAXLEN+1];
1038 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1039 STRLEN len, foldlen;
d7f013c8 1040
09091399 1041 if (c1 == c2) {
1aa99e6b 1042 while (s <= e) {
872c91ae
JH
1043 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1044 ckWARN(WARN_UTF8) ?
1045 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1046 if ( c == c1
1047 && (ln == len ||
66423254 1048 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1049 m, (char **)0, ln, (bool)UTF))
55da9344 1050 && (norun || regtry(prog, s)) )
1aa99e6b 1051 goto got_it;
80aecb99
JH
1052 else {
1053 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1054 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1055 if ( f != c
1056 && (f == c1 || f == c2)
1057 && (ln == foldlen ||
66423254
JH
1058 !ibcmp_utf8((char *) foldbuf,
1059 (char **)0, foldlen, do_utf8,
d07ddd77 1060 m,
eb160463 1061 (char **)0, ln, (bool)UTF))
80aecb99
JH
1062 && (norun || regtry(prog, s)) )
1063 goto got_it;
1064 }
1aa99e6b
IH
1065 s += len;
1066 }
09091399
JH
1067 }
1068 else {
1aa99e6b 1069 while (s <= e) {
872c91ae
JH
1070 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1071 ckWARN(WARN_UTF8) ?
1072 0 : UTF8_ALLOW_ANY);
80aecb99 1073
60a8b682 1074 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1075 * Note that not all the possible combinations
1076 * are handled here: some of them are handled
1077 * by the standard folding rules, and some of
1078 * them (the character class or ANYOF cases)
1079 * are handled during compiletime in
1080 * regexec.c:S_regclass(). */
880bd946
JH
1081 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1082 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1083 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1084
1085 if ( (c == c1 || c == c2)
1086 && (ln == len ||
66423254 1087 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1088 m, (char **)0, ln, (bool)UTF))
55da9344 1089 && (norun || regtry(prog, s)) )
1aa99e6b 1090 goto got_it;
80aecb99
JH
1091 else {
1092 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1093 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1094 if ( f != c
1095 && (f == c1 || f == c2)
1096 && (ln == foldlen ||
a6872d42 1097 !ibcmp_utf8((char *) foldbuf,
66423254 1098 (char **)0, foldlen, do_utf8,
d07ddd77 1099 m,
eb160463 1100 (char **)0, ln, (bool)UTF))
80aecb99
JH
1101 && (norun || regtry(prog, s)) )
1102 goto got_it;
1103 }
1aa99e6b
IH
1104 s += len;
1105 }
09091399 1106 }
1aa99e6b
IH
1107 }
1108 else {
1109 if (c1 == c2)
1110 while (s <= e) {
1111 if ( *(U8*)s == c1
1112 && (ln == 1 || !(OP(c) == EXACTF
1113 ? ibcmp(s, m, ln)
1114 : ibcmp_locale(s, m, ln)))
1115 && (norun || regtry(prog, s)) )
1116 goto got_it;
1117 s++;
1118 }
1119 else
1120 while (s <= e) {
1121 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1122 && (ln == 1 || !(OP(c) == EXACTF
1123 ? ibcmp(s, m, ln)
1124 : ibcmp_locale(s, m, ln)))
1125 && (norun || regtry(prog, s)) )
1126 goto got_it;
1127 s++;
1128 }
b3c9acc1
IZ
1129 }
1130 break;
bbce6d69 1131 case BOUNDL:
3280af22 1132 PL_reg_flags |= RF_tainted;
bbce6d69 1133 /* FALL THROUGH */
a0d0e21e 1134 case BOUND:
ffc61ed2 1135 if (do_utf8) {
12d33761 1136 if (s == PL_bostr)
ffc61ed2
JH
1137 tmp = '\n';
1138 else {
b4f7163a 1139 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1140
b4f7163a 1141 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1142 }
1143 tmp = ((OP(c) == BOUND ?
9041c2e3 1144 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1145 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1146 while (s < strend) {
1147 if (tmp == !(OP(c) == BOUND ?
3568d838 1148 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1149 isALNUM_LC_utf8((U8*)s)))
1150 {
1151 tmp = !tmp;
1152 if ((norun || regtry(prog, s)))
1153 goto got_it;
1154 }
1155 s += UTF8SKIP(s);
a687059c 1156 }
a0d0e21e 1157 }
667bb95a 1158 else {
12d33761 1159 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1160 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1161 while (s < strend) {
1162 if (tmp ==
1163 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1164 tmp = !tmp;
1165 if ((norun || regtry(prog, s)))
1166 goto got_it;
1167 }
1168 s++;
a0ed51b3 1169 }
a0ed51b3 1170 }
6eb5f6b9 1171 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1172 goto got_it;
1173 break;
bbce6d69 1174 case NBOUNDL:
3280af22 1175 PL_reg_flags |= RF_tainted;
bbce6d69 1176 /* FALL THROUGH */
a0d0e21e 1177 case NBOUND:
ffc61ed2 1178 if (do_utf8) {
12d33761 1179 if (s == PL_bostr)
ffc61ed2
JH
1180 tmp = '\n';
1181 else {
b4f7163a 1182 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1183
b4f7163a 1184 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1185 }
1186 tmp = ((OP(c) == NBOUND ?
9041c2e3 1187 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1188 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1189 while (s < strend) {
1190 if (tmp == !(OP(c) == NBOUND ?
3568d838 1191 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1192 isALNUM_LC_utf8((U8*)s)))
1193 tmp = !tmp;
1194 else if ((norun || regtry(prog, s)))
1195 goto got_it;
1196 s += UTF8SKIP(s);
1197 }
a0d0e21e 1198 }
667bb95a 1199 else {
12d33761 1200 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1201 tmp = ((OP(c) == NBOUND ?
1202 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1203 while (s < strend) {
1204 if (tmp ==
1205 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1206 tmp = !tmp;
1207 else if ((norun || regtry(prog, s)))
1208 goto got_it;
1209 s++;
1210 }
a0ed51b3 1211 }
6eb5f6b9 1212 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1213 goto got_it;
1214 break;
a0d0e21e 1215 case ALNUM:
ffc61ed2 1216 if (do_utf8) {
8269fa76 1217 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1218 while (s < strend) {
3568d838 1219 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1220 if (tmp && (norun || regtry(prog, s)))
1221 goto got_it;
1222 else
1223 tmp = doevery;
1224 }
bbce6d69 1225 else
ffc61ed2
JH
1226 tmp = 1;
1227 s += UTF8SKIP(s);
bbce6d69 1228 }
bbce6d69 1229 }
ffc61ed2
JH
1230 else {
1231 while (s < strend) {
1232 if (isALNUM(*s)) {
1233 if (tmp && (norun || regtry(prog, s)))
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
a0ed51b3 1238 else
ffc61ed2
JH
1239 tmp = 1;
1240 s++;
a0ed51b3 1241 }
a0ed51b3
LW
1242 }
1243 break;
bbce6d69 1244 case ALNUML:
3280af22 1245 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1246 if (do_utf8) {
1247 while (s < strend) {
1248 if (isALNUM_LC_utf8((U8*)s)) {
1249 if (tmp && (norun || regtry(prog, s)))
1250 goto got_it;
1251 else
1252 tmp = doevery;
1253 }
a687059c 1254 else
ffc61ed2
JH
1255 tmp = 1;
1256 s += UTF8SKIP(s);
a0d0e21e 1257 }
a0d0e21e 1258 }
ffc61ed2
JH
1259 else {
1260 while (s < strend) {
1261 if (isALNUM_LC(*s)) {
1262 if (tmp && (norun || regtry(prog, s)))
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a0ed51b3 1267 else
ffc61ed2
JH
1268 tmp = 1;
1269 s++;
a0ed51b3 1270 }
a0ed51b3
LW
1271 }
1272 break;
a0d0e21e 1273 case NALNUM:
ffc61ed2 1274 if (do_utf8) {
8269fa76 1275 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1276 while (s < strend) {
3568d838 1277 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1278 if (tmp && (norun || regtry(prog, s)))
1279 goto got_it;
1280 else
1281 tmp = doevery;
1282 }
bbce6d69 1283 else
ffc61ed2
JH
1284 tmp = 1;
1285 s += UTF8SKIP(s);
bbce6d69 1286 }
bbce6d69 1287 }
ffc61ed2
JH
1288 else {
1289 while (s < strend) {
1290 if (!isALNUM(*s)) {
1291 if (tmp && (norun || regtry(prog, s)))
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
a0ed51b3 1296 else
ffc61ed2
JH
1297 tmp = 1;
1298 s++;
a0ed51b3 1299 }
a0ed51b3
LW
1300 }
1301 break;
bbce6d69 1302 case NALNUML:
3280af22 1303 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1304 if (do_utf8) {
1305 while (s < strend) {
1306 if (!isALNUM_LC_utf8((U8*)s)) {
1307 if (tmp && (norun || regtry(prog, s)))
1308 goto got_it;
1309 else
1310 tmp = doevery;
1311 }
a687059c 1312 else
ffc61ed2
JH
1313 tmp = 1;
1314 s += UTF8SKIP(s);
a687059c 1315 }
a0d0e21e 1316 }
ffc61ed2
JH
1317 else {
1318 while (s < strend) {
1319 if (!isALNUM_LC(*s)) {
1320 if (tmp && (norun || regtry(prog, s)))
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a0ed51b3 1325 else
ffc61ed2
JH
1326 tmp = 1;
1327 s++;
a0ed51b3 1328 }
a0ed51b3
LW
1329 }
1330 break;
a0d0e21e 1331 case SPACE:
ffc61ed2 1332 if (do_utf8) {
8269fa76 1333 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1334 while (s < strend) {
3568d838 1335 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1336 if (tmp && (norun || regtry(prog, s)))
1337 goto got_it;
1338 else
1339 tmp = doevery;
1340 }
a0d0e21e 1341 else
ffc61ed2
JH
1342 tmp = 1;
1343 s += UTF8SKIP(s);
2304df62 1344 }
a0d0e21e 1345 }
ffc61ed2
JH
1346 else {
1347 while (s < strend) {
1348 if (isSPACE(*s)) {
1349 if (tmp && (norun || regtry(prog, s)))
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0ed51b3 1354 else
ffc61ed2
JH
1355 tmp = 1;
1356 s++;
a0ed51b3 1357 }
a0ed51b3
LW
1358 }
1359 break;
bbce6d69 1360 case SPACEL:
3280af22 1361 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1362 if (do_utf8) {
1363 while (s < strend) {
1364 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1365 if (tmp && (norun || regtry(prog, s)))
1366 goto got_it;
1367 else
1368 tmp = doevery;
1369 }
bbce6d69 1370 else
ffc61ed2
JH
1371 tmp = 1;
1372 s += UTF8SKIP(s);
bbce6d69 1373 }
bbce6d69 1374 }
ffc61ed2
JH
1375 else {
1376 while (s < strend) {
1377 if (isSPACE_LC(*s)) {
1378 if (tmp && (norun || regtry(prog, s)))
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
a0ed51b3 1383 else
ffc61ed2
JH
1384 tmp = 1;
1385 s++;
a0ed51b3 1386 }
a0ed51b3
LW
1387 }
1388 break;
a0d0e21e 1389 case NSPACE:
ffc61ed2 1390 if (do_utf8) {
8269fa76 1391 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1392 while (s < strend) {
3568d838 1393 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1394 if (tmp && (norun || regtry(prog, s)))
1395 goto got_it;
1396 else
1397 tmp = doevery;
1398 }
a0d0e21e 1399 else
ffc61ed2
JH
1400 tmp = 1;
1401 s += UTF8SKIP(s);
a687059c 1402 }
a0d0e21e 1403 }
ffc61ed2
JH
1404 else {
1405 while (s < strend) {
1406 if (!isSPACE(*s)) {
1407 if (tmp && (norun || regtry(prog, s)))
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0ed51b3 1412 else
ffc61ed2
JH
1413 tmp = 1;
1414 s++;
a0ed51b3 1415 }
a0ed51b3
LW
1416 }
1417 break;
bbce6d69 1418 case NSPACEL:
3280af22 1419 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1420 if (do_utf8) {
1421 while (s < strend) {
1422 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1423 if (tmp && (norun || regtry(prog, s)))
1424 goto got_it;
1425 else
1426 tmp = doevery;
1427 }
bbce6d69 1428 else
ffc61ed2
JH
1429 tmp = 1;
1430 s += UTF8SKIP(s);
bbce6d69 1431 }
bbce6d69 1432 }
ffc61ed2
JH
1433 else {
1434 while (s < strend) {
1435 if (!isSPACE_LC(*s)) {
1436 if (tmp && (norun || regtry(prog, s)))
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
a0ed51b3 1441 else
ffc61ed2
JH
1442 tmp = 1;
1443 s++;
a0ed51b3 1444 }
a0ed51b3
LW
1445 }
1446 break;
a0d0e21e 1447 case DIGIT:
ffc61ed2 1448 if (do_utf8) {
8269fa76 1449 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1450 while (s < strend) {
3568d838 1451 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1452 if (tmp && (norun || regtry(prog, s)))
1453 goto got_it;
1454 else
1455 tmp = doevery;
1456 }
a0d0e21e 1457 else
ffc61ed2
JH
1458 tmp = 1;
1459 s += UTF8SKIP(s);
2b69d0c2 1460 }
a0d0e21e 1461 }
ffc61ed2
JH
1462 else {
1463 while (s < strend) {
1464 if (isDIGIT(*s)) {
1465 if (tmp && (norun || regtry(prog, s)))
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0ed51b3 1470 else
ffc61ed2
JH
1471 tmp = 1;
1472 s++;
a0ed51b3 1473 }
a0ed51b3
LW
1474 }
1475 break;
b8c5462f
JH
1476 case DIGITL:
1477 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1478 if (do_utf8) {
1479 while (s < strend) {
1480 if (isDIGIT_LC_utf8((U8*)s)) {
1481 if (tmp && (norun || regtry(prog, s)))
1482 goto got_it;
1483 else
1484 tmp = doevery;
1485 }
b8c5462f 1486 else
ffc61ed2
JH
1487 tmp = 1;
1488 s += UTF8SKIP(s);
b8c5462f 1489 }
b8c5462f 1490 }
ffc61ed2
JH
1491 else {
1492 while (s < strend) {
1493 if (isDIGIT_LC(*s)) {
1494 if (tmp && (norun || regtry(prog, s)))
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2
JH
1500 tmp = 1;
1501 s++;
b8c5462f 1502 }
b8c5462f
JH
1503 }
1504 break;
a0d0e21e 1505 case NDIGIT:
ffc61ed2 1506 if (do_utf8) {
8269fa76 1507 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1508 while (s < strend) {
3568d838 1509 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1510 if (tmp && (norun || regtry(prog, s)))
1511 goto got_it;
1512 else
1513 tmp = doevery;
1514 }
a0d0e21e 1515 else
ffc61ed2
JH
1516 tmp = 1;
1517 s += UTF8SKIP(s);
a687059c 1518 }
a0d0e21e 1519 }
ffc61ed2
JH
1520 else {
1521 while (s < strend) {
1522 if (!isDIGIT(*s)) {
1523 if (tmp && (norun || regtry(prog, s)))
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0ed51b3 1528 else
ffc61ed2
JH
1529 tmp = 1;
1530 s++;
a0ed51b3 1531 }
a0ed51b3
LW
1532 }
1533 break;
b8c5462f
JH
1534 case NDIGITL:
1535 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1536 if (do_utf8) {
1537 while (s < strend) {
1538 if (!isDIGIT_LC_utf8((U8*)s)) {
1539 if (tmp && (norun || regtry(prog, s)))
1540 goto got_it;
1541 else
1542 tmp = doevery;
1543 }
b8c5462f 1544 else
ffc61ed2
JH
1545 tmp = 1;
1546 s += UTF8SKIP(s);
b8c5462f 1547 }
a0ed51b3 1548 }
ffc61ed2
JH
1549 else {
1550 while (s < strend) {
1551 if (!isDIGIT_LC(*s)) {
1552 if (tmp && (norun || regtry(prog, s)))
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
cf93c79d 1557 else
ffc61ed2
JH
1558 tmp = 1;
1559 s++;
b8c5462f 1560 }
b8c5462f
JH
1561 }
1562 break;
b3c9acc1 1563 default:
3c3eec57
GS
1564 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1565 break;
d6a28714 1566 }
6eb5f6b9
JH
1567 return 0;
1568 got_it:
1569 return s;
1570}
1571
1572/*
1573 - regexec_flags - match a regexp against a string
1574 */
1575I32
1576Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1577 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1578/* strend: pointer to null at end of string */
1579/* strbeg: real beginning of string */
1580/* minend: end of match must be >=minend after stringarg. */
1581/* data: May be used for some additional optimizations. */
1582/* nosave: For optimizations. */
1583{
6eb5f6b9
JH
1584 register char *s;
1585 register regnode *c;
1586 register char *startpos = stringarg;
6eb5f6b9
JH
1587 I32 minlen; /* must match at least this many chars */
1588 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1589 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1590 constant substr. */ /* CC */
1591 I32 end_shift = 0; /* Same for the end. */ /* CC */
1592 I32 scream_pos = -1; /* Internal iterator of scream. */
1593 char *scream_olds;
1594 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1595 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1596#ifdef DEBUGGING
9e55ce06
JH
1597 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1598 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1599#endif
6eb5f6b9
JH
1600
1601 PL_regcc = 0;
1602
1603 cache_re(prog);
1604#ifdef DEBUGGING
aea4f609 1605 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1606#endif
1607
1608 /* Be paranoid... */
1609 if (prog == NULL || startpos == NULL) {
1610 Perl_croak(aTHX_ "NULL regexp parameter");
1611 return 0;
1612 }
1613
1614 minlen = prog->minlen;
61a36c01 1615 if (strend - startpos < minlen) {
a72c7584
JH
1616 DEBUG_r(PerlIO_printf(Perl_debug_log,
1617 "String too short [regexec_flags]...\n"));
1618 goto phooey;
1aa99e6b 1619 }
6eb5f6b9 1620
6eb5f6b9
JH
1621 /* Check validity of program. */
1622 if (UCHARAT(prog->program) != REG_MAGIC) {
1623 Perl_croak(aTHX_ "corrupted regexp program");
1624 }
1625
1626 PL_reg_flags = 0;
1627 PL_reg_eval_set = 0;
1628 PL_reg_maxiter = 0;
1629
1630 if (prog->reganch & ROPT_UTF8)
1631 PL_reg_flags |= RF_utf8;
1632
1633 /* Mark beginning of line for ^ and lookbehind. */
1634 PL_regbol = startpos;
1635 PL_bostr = strbeg;
1636 PL_reg_sv = sv;
1637
1638 /* Mark end of line for $ (and such) */
1639 PL_regeol = strend;
1640
1641 /* see how far we have to get to not match where we matched before */
1642 PL_regtill = startpos+minend;
1643
1644 /* We start without call_cc context. */
1645 PL_reg_call_cc = 0;
1646
1647 /* If there is a "must appear" string, look for it. */
1648 s = startpos;
1649
1650 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1651 MAGIC *mg;
1652
1653 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1654 PL_reg_ganch = startpos;
1655 else if (sv && SvTYPE(sv) >= SVt_PVMG
1656 && SvMAGIC(sv)
14befaf4
DM
1657 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1658 && mg->mg_len >= 0) {
6eb5f6b9
JH
1659 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1660 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1661 if (s > PL_reg_ganch)
6eb5f6b9
JH
1662 goto phooey;
1663 s = PL_reg_ganch;
1664 }
1665 }
1666 else /* pos() not defined */
1667 PL_reg_ganch = strbeg;
1668 }
1669
33b8afdf 1670 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1671 re_scream_pos_data d;
1672
1673 d.scream_olds = &scream_olds;
1674 d.scream_pos = &scream_pos;
1675 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1676 if (!s) {
1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1678 goto phooey; /* not present */
3fa9c3d7 1679 }
6eb5f6b9
JH
1680 }
1681
2a782b5b 1682 DEBUG_r({
9e55ce06
JH
1683 char *s0 = UTF ?
1684 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1685 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1686 prog->precomp;
1687 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1688 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1689 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1690 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1691 if (!PL_colorset)
1692 reginitcolors();
1693 PerlIO_printf(Perl_debug_log,
9e55ce06 1694 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1695 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1696 len0, len0, s0,
2a782b5b 1697 PL_colors[1],
9e55ce06 1698 len0 > 60 ? "..." : "",
2a782b5b 1699 PL_colors[0],
9e55ce06
JH
1700 (int)(len1 > 60 ? 60 : len1),
1701 s1, PL_colors[1],
1702 (len1 > 60 ? "..." : "")
2a782b5b
JH
1703 );
1704 });
6eb5f6b9
JH
1705
1706 /* Simplest case: anchored match need be tried only once. */
1707 /* [unless only anchor is BOL and multiline is set] */
1708 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1709 if (s == startpos && regtry(prog, startpos))
1710 goto got_it;
1711 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1712 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1713 {
1714 char *end;
1715
1716 if (minlen)
1717 dontbother = minlen - 1;
1aa99e6b 1718 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1719 /* for multiline we only have to try after newlines */
33b8afdf 1720 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1721 if (s == startpos)
1722 goto after_try;
1723 while (1) {
1724 if (regtry(prog, s))
1725 goto got_it;
1726 after_try:
1727 if (s >= end)
1728 goto phooey;
1729 if (prog->reganch & RE_USE_INTUIT) {
1730 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1731 if (!s)
1732 goto phooey;
1733 }
1734 else
1735 s++;
1736 }
1737 } else {
1738 if (s > startpos)
1739 s--;
1740 while (s < end) {
1741 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1742 if (regtry(prog, s))
1743 goto got_it;
1744 }
1745 }
1746 }
1747 }
1748 goto phooey;
1749 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1750 if (regtry(prog, PL_reg_ganch))
1751 goto got_it;
1752 goto phooey;
1753 }
1754
1755 /* Messy cases: unanchored match. */
33b8afdf 1756 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1757 /* we have /x+whatever/ */
1758 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1759 char ch;
bf93d4cc
GS
1760#ifdef DEBUGGING
1761 int did_match = 0;
1762#endif
33b8afdf
JH
1763 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1764 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1765 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1766
1aa99e6b 1767 if (do_utf8) {
6eb5f6b9
JH
1768 while (s < strend) {
1769 if (*s == ch) {
bf93d4cc 1770 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1771 if (regtry(prog, s)) goto got_it;
1772 s += UTF8SKIP(s);
1773 while (s < strend && *s == ch)
1774 s += UTF8SKIP(s);
1775 }
1776 s += UTF8SKIP(s);
1777 }
1778 }
1779 else {
1780 while (s < strend) {
1781 if (*s == ch) {
bf93d4cc 1782 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1783 if (regtry(prog, s)) goto got_it;
1784 s++;
1785 while (s < strend && *s == ch)
1786 s++;
1787 }
1788 s++;
1789 }
1790 }
b7953727 1791 DEBUG_r(if (!did_match)
bf93d4cc 1792 PerlIO_printf(Perl_debug_log,
b7953727
JH
1793 "Did not find anchored character...\n")
1794 );
6eb5f6b9
JH
1795 }
1796 /*SUPPRESS 560*/
33b8afdf
JH
1797 else if (prog->anchored_substr != Nullsv
1798 || prog->anchored_utf8 != Nullsv
1799 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1800 && prog->float_max_offset < strend - s)) {
1801 SV *must;
1802 I32 back_max;
1803 I32 back_min;
1804 char *last;
6eb5f6b9 1805 char *last1; /* Last position checked before */
bf93d4cc
GS
1806#ifdef DEBUGGING
1807 int did_match = 0;
1808#endif
33b8afdf
JH
1809 if (prog->anchored_substr || prog->anchored_utf8) {
1810 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1811 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1812 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1813 back_max = back_min = prog->anchored_offset;
1814 } else {
1815 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1816 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1817 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1818 back_max = prog->float_max_offset;
1819 back_min = prog->float_min_offset;
1820 }
1821 if (must == &PL_sv_undef)
1822 /* could not downgrade utf8 check substring, so must fail */
1823 goto phooey;
1824
1825 last = HOP3c(strend, /* Cannot start after this */
1826 -(I32)(CHR_SVLEN(must)
1827 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1828
1829 if (s > PL_bostr)
1830 last1 = HOPc(s, -1);
1831 else
1832 last1 = s - 1; /* bogus */
1833
1834 /* XXXX check_substr already used to find `s', can optimize if
1835 check_substr==must. */
1836 scream_pos = -1;
1837 dontbother = end_shift;
1838 strend = HOPc(strend, -dontbother);
1839 while ( (s <= last) &&
9041c2e3 1840 ((flags & REXEC_SCREAM)
1aa99e6b 1841 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1842 end_shift, &scream_pos, 0))
1aa99e6b 1843 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1844 (unsigned char*)strend, must,
6eb5f6b9 1845 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
bf93d4cc 1846 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1847 if (HOPc(s, -back_max) > last1) {
1848 last1 = HOPc(s, -back_min);
1849 s = HOPc(s, -back_max);
1850 }
1851 else {
1852 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1853
1854 last1 = HOPc(s, -back_min);
1855 s = t;
1856 }
1aa99e6b 1857 if (do_utf8) {
6eb5f6b9
JH
1858 while (s <= last1) {
1859 if (regtry(prog, s))
1860 goto got_it;
1861 s += UTF8SKIP(s);
1862 }
1863 }
1864 else {
1865 while (s <= last1) {
1866 if (regtry(prog, s))
1867 goto got_it;
1868 s++;
1869 }
1870 }
1871 }
b7953727
JH
1872 DEBUG_r(if (!did_match)
1873 PerlIO_printf(Perl_debug_log,
1874 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1875 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1876 ? "anchored" : "floating"),
1877 PL_colors[0],
1878 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1879 SvPVX(must),
b7953727
JH
1880 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1881 );
6eb5f6b9
JH
1882 goto phooey;
1883 }
155aba94 1884 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1885 if (minlen) {
1886 I32 op = (U8)OP(prog->regstclass);
66e933ab 1887 /* don't bother with what can't match */
f14c76ed
RGS
1888 if (PL_regkind[op] != EXACT && op != CANY)
1889 strend = HOPc(strend, -(minlen - 1));
1890 }
ffc61ed2
JH
1891 DEBUG_r({
1892 SV *prop = sv_newmortal();
9e55ce06
JH
1893 char *s0;
1894 char *s1;
1895 int len0;
1896 int len1;
1897
ffc61ed2 1898 regprop(prop, c);
9e55ce06
JH
1899 s0 = UTF ?
1900 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1901 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1902 SvPVX(prop);
1903 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1904 s1 = UTF ?
c728cb41 1905 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1906 len1 = UTF ? SvCUR(dsv1) : strend - s;
1907 PerlIO_printf(Perl_debug_log,
1908 "Matching stclass `%*.*s' against `%*.*s'\n",
1909 len0, len0, s0,
1910 len1, len1, s1);
ffc61ed2 1911 });
6eb5f6b9
JH
1912 if (find_byclass(prog, c, s, strend, startpos, 0))
1913 goto got_it;
bf93d4cc 1914 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1915 }
1916 else {
1917 dontbother = 0;
33b8afdf
JH
1918 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1919 /* Trim the end. */
d6a28714 1920 char *last;
33b8afdf
JH
1921 SV* float_real;
1922
1923 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1924 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1925 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1926
1927 if (flags & REXEC_SCREAM) {
33b8afdf 1928 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1929 end_shift, &scream_pos, 1); /* last one */
1930 if (!last)
ffc61ed2 1931 last = scream_olds; /* Only one occurrence. */
b8c5462f 1932 }
d6a28714
JH
1933 else {
1934 STRLEN len;
33b8afdf 1935 char *little = SvPV(float_real, len);
d6a28714 1936
33b8afdf 1937 if (SvTAIL(float_real)) {
d6a28714
JH
1938 if (memEQ(strend - len + 1, little, len - 1))
1939 last = strend - len + 1;
1940 else if (!PL_multiline)
9041c2e3 1941 last = memEQ(strend - len, little, len)
d6a28714 1942 ? strend - len : Nullch;
b8c5462f 1943 else
d6a28714
JH
1944 goto find_last;
1945 } else {
1946 find_last:
9041c2e3 1947 if (len)
d6a28714 1948 last = rninstr(s, strend, little, little + len);
b8c5462f 1949 else
d6a28714 1950 last = strend; /* matching `$' */
b8c5462f 1951 }
b8c5462f 1952 }
bf93d4cc
GS
1953 if (last == NULL) {
1954 DEBUG_r(PerlIO_printf(Perl_debug_log,
1955 "%sCan't trim the tail, match fails (should not happen)%s\n",
1956 PL_colors[4],PL_colors[5]));
1957 goto phooey; /* Should not happen! */
1958 }
d6a28714
JH
1959 dontbother = strend - last + prog->float_min_offset;
1960 }
1961 if (minlen && (dontbother < minlen))
1962 dontbother = minlen - 1;
1963 strend -= dontbother; /* this one's always in bytes! */
1964 /* We don't know much -- general case. */
1aa99e6b 1965 if (do_utf8) {
d6a28714
JH
1966 for (;;) {
1967 if (regtry(prog, s))
1968 goto got_it;
1969 if (s >= strend)
1970 break;
b8c5462f 1971 s += UTF8SKIP(s);
d6a28714
JH
1972 };
1973 }
1974 else {
1975 do {
1976 if (regtry(prog, s))
1977 goto got_it;
1978 } while (s++ < strend);
1979 }
1980 }
1981
1982 /* Failure. */
1983 goto phooey;
1984
1985got_it:
1986 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1987
1988 if (PL_reg_eval_set) {
1989 /* Preserve the current value of $^R */
1990 if (oreplsv != GvSV(PL_replgv))
1991 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1992 restored, the value remains
1993 the same. */
acfe0abc 1994 restore_pos(aTHX_ 0);
d6a28714
JH
1995 }
1996
1997 /* make sure $`, $&, $', and $digit will work later */
1998 if ( !(flags & REXEC_NOT_FIRST) ) {
1999 if (RX_MATCH_COPIED(prog)) {
2000 Safefree(prog->subbeg);
2001 RX_MATCH_COPIED_off(prog);
2002 }
2003 if (flags & REXEC_COPY_STR) {
2004 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2005
2006 s = savepvn(strbeg, i);
2007 prog->subbeg = s;
2008 prog->sublen = i;
2009 RX_MATCH_COPIED_on(prog);
2010 }
2011 else {
2012 prog->subbeg = strbeg;
2013 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2014 }
2015 }
9041c2e3 2016
d6a28714
JH
2017 return 1;
2018
2019phooey:
bf93d4cc
GS
2020 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2021 PL_colors[4],PL_colors[5]));
d6a28714 2022 if (PL_reg_eval_set)
acfe0abc 2023 restore_pos(aTHX_ 0);
d6a28714
JH
2024 return 0;
2025}
2026
2027/*
2028 - regtry - try match at specific point
2029 */
2030STATIC I32 /* 0 failure, 1 success */
2031S_regtry(pTHX_ regexp *prog, char *startpos)
2032{
d6a28714
JH
2033 register I32 i;
2034 register I32 *sp;
2035 register I32 *ep;
2036 CHECKPOINT lastcp;
2037
02db2b7b
IZ
2038#ifdef DEBUGGING
2039 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2040#endif
d6a28714
JH
2041 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2042 MAGIC *mg;
2043
2044 PL_reg_eval_set = RS_init;
2045 DEBUG_r(DEBUG_s(
b900a521
JH
2046 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2047 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2048 ));
e8347627 2049 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2050 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2051 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2052 SAVETMPS;
2053 /* Apparently this is not needed, judging by wantarray. */
e8347627 2054 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2055 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2056
2057 if (PL_reg_sv) {
2058 /* Make $_ available to executed code. */
2059 if (PL_reg_sv != DEFSV) {
4d1ff10f 2060 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2061 SAVESPTR(DEFSV);
2062 DEFSV = PL_reg_sv;
b8c5462f 2063 }
d6a28714 2064
9041c2e3 2065 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2066 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2067 /* prepare for quick setting of pos */
14befaf4
DM
2068 sv_magic(PL_reg_sv, (SV*)0,
2069 PERL_MAGIC_regex_global, Nullch, 0);
2070 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2071 mg->mg_len = -1;
b8c5462f 2072 }
d6a28714
JH
2073 PL_reg_magic = mg;
2074 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2075 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2076 }
09687e5a 2077 if (!PL_reg_curpm) {
0f79a09d 2078 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2079#ifdef USE_ITHREADS
2080 {
2081 SV* repointer = newSViv(0);
577e12cc 2082 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2083 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2084 av_push(PL_regex_padav,repointer);
2085 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2086 PL_regex_pad = AvARRAY(PL_regex_padav);
2087 }
2088#endif
2089 }
aaa362c4 2090 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2091 PL_reg_oldcurpm = PL_curpm;
2092 PL_curpm = PL_reg_curpm;
2093 if (RX_MATCH_COPIED(prog)) {
2094 /* Here is a serious problem: we cannot rewrite subbeg,
2095 since it may be needed if this match fails. Thus
2096 $` inside (?{}) could fail... */
2097 PL_reg_oldsaved = prog->subbeg;
2098 PL_reg_oldsavedlen = prog->sublen;
2099 RX_MATCH_COPIED_off(prog);
2100 }
2101 else
2102 PL_reg_oldsaved = Nullch;
2103 prog->subbeg = PL_bostr;
2104 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2105 }
2106 prog->startp[0] = startpos - PL_bostr;
2107 PL_reginput = startpos;
2108 PL_regstartp = prog->startp;
2109 PL_regendp = prog->endp;
2110 PL_reglastparen = &prog->lastparen;
a01268b5 2111 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2112 prog->lastparen = 0;
03994de8 2113 prog->lastcloseparen = 0;
d6a28714
JH
2114 PL_regsize = 0;
2115 DEBUG_r(PL_reg_starttry = startpos);
2116 if (PL_reg_start_tmpl <= prog->nparens) {
2117 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2118 if(PL_reg_start_tmp)
2119 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2120 else
2121 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2122 }
2123
2124 /* XXXX What this code is doing here?!!! There should be no need
2125 to do this again and again, PL_reglastparen should take care of
3dd2943c 2126 this! --ilya*/
dafc8851
JH
2127
2128 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2129 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2130 * PL_reglastparen), is not needed at all by the test suite
2131 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2132 * enough, for building DynaLoader, or otherwise this
2133 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2134 * will happen. Meanwhile, this code *is* needed for the
2135 * above-mentioned test suite tests to succeed. The common theme
2136 * on those tests seems to be returning null fields from matches.
2137 * --jhi */
dafc8851 2138#if 1
d6a28714
JH
2139 sp = prog->startp;
2140 ep = prog->endp;
2141 if (prog->nparens) {
eb160463 2142 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2143 *++sp = -1;
2144 *++ep = -1;
2145 }
2146 }
dafc8851 2147#endif
02db2b7b 2148 REGCP_SET(lastcp);
d6a28714
JH
2149 if (regmatch(prog->program + 1)) {
2150 prog->endp[0] = PL_reginput - PL_bostr;
2151 return 1;
2152 }
02db2b7b 2153 REGCP_UNWIND(lastcp);
d6a28714
JH
2154 return 0;
2155}
2156
02db2b7b
IZ
2157#define RE_UNWIND_BRANCH 1
2158#define RE_UNWIND_BRANCHJ 2
2159
2160union re_unwind_t;
2161
2162typedef struct { /* XX: makes sense to enlarge it... */
2163 I32 type;
2164 I32 prev;
2165 CHECKPOINT lastcp;
2166} re_unwind_generic_t;
2167
2168typedef struct {
2169 I32 type;
2170 I32 prev;
2171 CHECKPOINT lastcp;
2172 I32 lastparen;
2173 regnode *next;
2174 char *locinput;
2175 I32 nextchr;
2176#ifdef DEBUGGING
2177 int regindent;
2178#endif
2179} re_unwind_branch_t;
2180
2181typedef union re_unwind_t {
2182 I32 type;
2183 re_unwind_generic_t generic;
2184 re_unwind_branch_t branch;
2185} re_unwind_t;
2186
8ba1375e
MJD
2187#define sayYES goto yes
2188#define sayNO goto no
e0f9d4a8 2189#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2190#define sayYES_FINAL goto yes_final
2191#define sayYES_LOUD goto yes_loud
2192#define sayNO_FINAL goto no_final
2193#define sayNO_SILENT goto do_no
2194#define saySAME(x) if (x) goto yes; else goto no
2195
2196#define REPORT_CODE_OFF 24
2197
d6a28714
JH
2198/*
2199 - regmatch - main matching routine
2200 *
2201 * Conceptually the strategy is simple: check to see whether the current
2202 * node matches, call self recursively to see whether the rest matches,
2203 * and then act accordingly. In practice we make some effort to avoid
2204 * recursion, in particular by going through "ordinary" nodes (that don't
2205 * need to know whether the rest of the match failed) by a loop instead of
2206 * by recursion.
2207 */
2208/* [lwall] I've hoisted the register declarations to the outer block in order to
2209 * maybe save a little bit of pushing and popping on the stack. It also takes
2210 * advantage of machines that use a register save mask on subroutine entry.
2211 */
2212STATIC I32 /* 0 failure, 1 success */
2213S_regmatch(pTHX_ regnode *prog)
2214{
d6a28714
JH
2215 register regnode *scan; /* Current node. */
2216 regnode *next; /* Next node. */
2217 regnode *inner; /* Next node in internal branch. */
2218 register I32 nextchr; /* renamed nextchr - nextchar colides with
2219 function of same name */
2220 register I32 n; /* no or next */
b7953727
JH
2221 register I32 ln = 0; /* len or last */
2222 register char *s = Nullch; /* operand or save */
d6a28714 2223 register char *locinput = PL_reginput;
b7953727 2224 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2225 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2226 I32 unwind = 0;
b7953727 2227#if 0
02db2b7b 2228 I32 firstcp = PL_savestack_ix;
b7953727 2229#endif
53c4c00c 2230 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2231#ifdef DEBUGGING
ce333219
JH
2232 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2233 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2234 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2235#endif
02db2b7b 2236
d6a28714
JH
2237#ifdef DEBUGGING
2238 PL_regindent++;
2239#endif
2240
2241 /* Note that nextchr is a byte even in UTF */
2242 nextchr = UCHARAT(locinput);
2243 scan = prog;
2244 while (scan != NULL) {
8ba1375e 2245
2a782b5b 2246 DEBUG_r( {
d6a28714
JH
2247 SV *prop = sv_newmortal();
2248 int docolor = *PL_colors[0];
2249 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2250 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2251 /* The part of the string before starttry has one color
2252 (pref0_len chars), between starttry and current
2253 position another one (pref_len - pref0_len chars),
2254 after the current position the third one.
2255 We assume that pref0_len <= pref_len, otherwise we
2256 decrease pref0_len. */
9041c2e3 2257 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2258 ? (5 + taill) - l : locinput - PL_bostr;
2259 int pref0_len;
d6a28714 2260
df1ffd02 2261 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2262 pref_len++;
2263 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2264 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2265 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2266 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2267 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2268 l--;
d6a28714
JH
2269 if (pref0_len < 0)
2270 pref0_len = 0;
2271 if (pref0_len > pref_len)
2272 pref0_len = pref_len;
2273 regprop(prop, scan);
2a782b5b
JH
2274 {
2275 char *s0 =
f14c76ed 2276 do_utf8 && OP(scan) != CANY ?
2a782b5b 2277 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2278 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2279 locinput - pref_len;
df1ffd02 2280 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2281 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2282 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2283 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2284 locinput - pref_len + pref0_len;
df1ffd02 2285 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2286 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2287 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2288 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2289 locinput;
df1ffd02 2290 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2291 PerlIO_printf(Perl_debug_log,
2292 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2293 (IV)(locinput - PL_bostr),
2294 PL_colors[4],
2295 len0, s0,
2296 PL_colors[5],
2297 PL_colors[2],
2298 len1, s1,
2299 PL_colors[3],
2300 (docolor ? "" : "> <"),
2301 PL_colors[0],
2302 len2, s2,
2303 PL_colors[1],
2304 15 - l - pref_len + 1,
2305 "",
2306 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2307 SvPVX(prop));
2308 }
2309 });
d6a28714
JH
2310
2311 next = scan + NEXT_OFF(scan);
2312 if (next == scan)
2313 next = NULL;
2314
2315 switch (OP(scan)) {
2316 case BOL:
12d33761
HS
2317 if (locinput == PL_bostr || (PL_multiline &&
2318 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2319 {
2320 /* regtill = regbol; */
b8c5462f
JH
2321 break;
2322 }
d6a28714
JH
2323 sayNO;
2324 case MBOL:
12d33761
HS
2325 if (locinput == PL_bostr ||
2326 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2327 {
b8c5462f
JH
2328 break;
2329 }
d6a28714
JH
2330 sayNO;
2331 case SBOL:
c2a73568 2332 if (locinput == PL_bostr)
b8c5462f 2333 break;
d6a28714
JH
2334 sayNO;
2335 case GPOS:
2336 if (locinput == PL_reg_ganch)
2337 break;
2338 sayNO;
2339 case EOL:
2340 if (PL_multiline)
2341 goto meol;
2342 else
2343 goto seol;
2344 case MEOL:
2345 meol:
2346 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2347 sayNO;
b8c5462f 2348 break;
d6a28714
JH
2349 case SEOL:
2350 seol:
2351 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2352 sayNO;
d6a28714 2353 if (PL_regeol - locinput > 1)
b8c5462f 2354 sayNO;
b8c5462f 2355 break;
d6a28714
JH
2356 case EOS:
2357 if (PL_regeol != locinput)
b8c5462f 2358 sayNO;
d6a28714 2359 break;
ffc61ed2 2360 case SANY:
d6a28714 2361 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2362 sayNO;
f33976b4
DB
2363 if (do_utf8) {
2364 locinput += PL_utf8skip[nextchr];
2365 if (locinput > PL_regeol)
2366 sayNO;
2367 nextchr = UCHARAT(locinput);
2368 }
2369 else
2370 nextchr = UCHARAT(++locinput);
2371 break;
2372 case CANY:
2373 if (!nextchr && locinput >= PL_regeol)
2374 sayNO;
b8c5462f 2375 nextchr = UCHARAT(++locinput);
a0d0e21e 2376 break;
ffc61ed2 2377 case REG_ANY:
1aa99e6b
IH
2378 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2379 sayNO;
2380 if (do_utf8) {
b8c5462f 2381 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2382 if (locinput > PL_regeol)
2383 sayNO;
a0ed51b3 2384 nextchr = UCHARAT(locinput);
a0ed51b3 2385 }
1aa99e6b
IH
2386 else
2387 nextchr = UCHARAT(++locinput);
a0ed51b3 2388 break;
d6a28714 2389 case EXACT:
cd439c50
IZ
2390 s = STRING(scan);
2391 ln = STR_LEN(scan);
eb160463 2392 if (do_utf8 != UTF) {
bc517b45 2393 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2394 char *l = locinput;
2395 char *e = s + ln;
bc517b45 2396 STRLEN ulen;
a72c7584 2397
5ff6fc6d
JH
2398 if (do_utf8) {
2399 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2400 while (s < e) {
2401 if (l >= PL_regeol)
5ff6fc6d
JH
2402 sayNO;
2403 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2404 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2405 ckWARN(WARN_UTF8) ?
2406 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2407 sayNO;
bc517b45 2408 l += ulen;
5ff6fc6d 2409 s ++;
1aa99e6b 2410 }
5ff6fc6d
JH
2411 }
2412 else {
2413 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2414 while (s < e) {
2415 if (l >= PL_regeol)
2416 sayNO;
5ff6fc6d 2417 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2418 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2419 ckWARN(WARN_UTF8) ?
2420 0 : UTF8_ALLOW_ANY))
1aa99e6b 2421 sayNO;
bc517b45 2422 s += ulen;
a72c7584 2423 l ++;
1aa99e6b 2424 }
5ff6fc6d 2425 }
1aa99e6b
IH
2426 locinput = l;
2427 nextchr = UCHARAT(locinput);
2428 break;
2429 }
bc517b45 2430 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2431 /* Inline the first character, for speed. */
2432 if (UCHARAT(s) != nextchr)
2433 sayNO;
2434 if (PL_regeol - locinput < ln)
2435 sayNO;
2436 if (ln > 1 && memNE(s, locinput, ln))
2437 sayNO;
2438 locinput += ln;
2439 nextchr = UCHARAT(locinput);
2440 break;
2441 case EXACTFL:
b8c5462f
JH
2442 PL_reg_flags |= RF_tainted;
2443 /* FALL THROUGH */
d6a28714 2444 case EXACTF:
cd439c50
IZ
2445 s = STRING(scan);
2446 ln = STR_LEN(scan);
d6a28714 2447
d07ddd77
JH
2448 if (do_utf8 || UTF) {
2449 /* Either target or the pattern are utf8. */
d6a28714 2450 char *l = locinput;
d07ddd77 2451 char *e = PL_regeol;
bc517b45 2452
eb160463 2453 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2454 l, &e, 0, do_utf8)) {
5486206c
JH
2455 /* One more case for the sharp s:
2456 * pack("U0U*", 0xDF) =~ /ss/i,
2457 * the 0xC3 0x9F are the UTF-8
2458 * byte sequence for the U+00DF. */
2459 if (!(do_utf8 &&
2460 toLOWER(s[0]) == 's' &&
2461 ln >= 2 &&
2462 toLOWER(s[1]) == 's' &&
2463 (U8)l[0] == 0xC3 &&
2464 e - l >= 2 &&
2465 (U8)l[1] == 0x9F))
2466 sayNO;
2467 }
d07ddd77
JH
2468 locinput = e;
2469 nextchr = UCHARAT(locinput);
2470 break;
a0ed51b3 2471 }
d6a28714 2472
bc517b45
JH
2473 /* Neither the target and the pattern are utf8. */
2474
d6a28714
JH
2475 /* Inline the first character, for speed. */
2476 if (UCHARAT(s) != nextchr &&
2477 UCHARAT(s) != ((OP(scan) == EXACTF)
2478 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2479 sayNO;
d6a28714 2480 if (PL_regeol - locinput < ln)
b8c5462f 2481 sayNO;
d6a28714
JH
2482 if (ln > 1 && (OP(scan) == EXACTF
2483 ? ibcmp(s, locinput, ln)
2484 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2485 sayNO;
d6a28714
JH
2486 locinput += ln;
2487 nextchr = UCHARAT(locinput);
a0d0e21e 2488 break;
d6a28714 2489 case ANYOF:
ffc61ed2 2490 if (do_utf8) {
9e55ce06
JH
2491 STRLEN inclasslen = PL_regeol - locinput;
2492
ba7b4546 2493 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2494 sayNO_ANYOF;
ffc61ed2
JH
2495 if (locinput >= PL_regeol)
2496 sayNO;
0f0076b4 2497 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2498 nextchr = UCHARAT(locinput);
e0f9d4a8 2499 break;
ffc61ed2
JH
2500 }
2501 else {
2502 if (nextchr < 0)
2503 nextchr = UCHARAT(locinput);
7d3e948e 2504 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2505 sayNO_ANYOF;
ffc61ed2
JH
2506 if (!nextchr && locinput >= PL_regeol)
2507 sayNO;
2508 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2509 break;
2510 }
2511 no_anyof:
2512 /* If we might have the case of the German sharp s
2513 * in a casefolding Unicode character class. */
2514
ebc501f0
JH
2515 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2516 locinput += SHARP_S_SKIP;
e0f9d4a8 2517 nextchr = UCHARAT(locinput);
ffc61ed2 2518 }
e0f9d4a8
JH
2519 else
2520 sayNO;
b8c5462f 2521 break;
d6a28714 2522 case ALNUML:
b8c5462f
JH
2523 PL_reg_flags |= RF_tainted;
2524 /* FALL THROUGH */
d6a28714 2525 case ALNUM:
b8c5462f 2526 if (!nextchr)
4633a7c4 2527 sayNO;
ffc61ed2 2528 if (do_utf8) {
ad24be35 2529 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2530 if (!(OP(scan) == ALNUM
3568d838 2531 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2532 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2533 {
2534 sayNO;
a0ed51b3 2535 }
b8c5462f 2536 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2537 nextchr = UCHARAT(locinput);
2538 break;
2539 }
ffc61ed2 2540 if (!(OP(scan) == ALNUM
d6a28714 2541 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2542 sayNO;
b8c5462f 2543 nextchr = UCHARAT(++locinput);
a0d0e21e 2544 break;
d6a28714 2545 case NALNUML:
b8c5462f
JH
2546 PL_reg_flags |= RF_tainted;
2547 /* FALL THROUGH */
d6a28714
JH
2548 case NALNUM:
2549 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2550 sayNO;
ffc61ed2 2551 if (do_utf8) {
8269fa76 2552 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2553 if (OP(scan) == NALNUM
3568d838 2554 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2555 : isALNUM_LC_utf8((U8*)locinput))
2556 {
b8c5462f 2557 sayNO;
d6a28714 2558 }
b8c5462f
JH
2559 locinput += PL_utf8skip[nextchr];
2560 nextchr = UCHARAT(locinput);
2561 break;
2562 }
ffc61ed2 2563 if (OP(scan) == NALNUM
d6a28714 2564 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2565 sayNO;
76e3520e 2566 nextchr = UCHARAT(++locinput);
a0d0e21e 2567 break;
d6a28714
JH
2568 case BOUNDL:
2569 case NBOUNDL:
3280af22 2570 PL_reg_flags |= RF_tainted;
bbce6d69 2571 /* FALL THROUGH */
d6a28714
JH
2572 case BOUND:
2573 case NBOUND:
2574 /* was last char in word? */
ffc61ed2 2575 if (do_utf8) {
12d33761
HS
2576 if (locinput == PL_bostr)
2577 ln = '\n';
ffc61ed2 2578 else {
b4f7163a 2579 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2580
b4f7163a 2581 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2582 }
2583 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2584 ln = isALNUM_uni(ln);
8269fa76 2585 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2586 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2587 }
2588 else {
9041c2e3 2589 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2590 n = isALNUM_LC_utf8((U8*)locinput);
2591 }
a0ed51b3 2592 }
d6a28714 2593 else {
12d33761
HS
2594 ln = (locinput != PL_bostr) ?
2595 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2596 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2597 ln = isALNUM(ln);
2598 n = isALNUM(nextchr);
2599 }
2600 else {
2601 ln = isALNUM_LC(ln);
2602 n = isALNUM_LC(nextchr);
2603 }
d6a28714 2604 }
ffc61ed2
JH
2605 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2606 OP(scan) == BOUNDL))
2607 sayNO;
a0ed51b3 2608 break;
d6a28714 2609 case SPACEL:
3280af22 2610 PL_reg_flags |= RF_tainted;
bbce6d69 2611 /* FALL THROUGH */
d6a28714 2612 case SPACE:
9442cb0e 2613 if (!nextchr)
4633a7c4 2614 sayNO;
1aa99e6b 2615 if (do_utf8) {
fd400ab9 2616 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2617 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2618 if (!(OP(scan) == SPACE
3568d838 2619 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2620 : isSPACE_LC_utf8((U8*)locinput)))
2621 {
2622 sayNO;
2623 }
2624 locinput += PL_utf8skip[nextchr];
2625 nextchr = UCHARAT(locinput);
2626 break;
d6a28714 2627 }
ffc61ed2
JH
2628 if (!(OP(scan) == SPACE
2629 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2630 sayNO;
2631 nextchr = UCHARAT(++locinput);
2632 }
2633 else {
2634 if (!(OP(scan) == SPACE
2635 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2636 sayNO;
2637 nextchr = UCHARAT(++locinput);
a0ed51b3 2638 }
a0ed51b3 2639 break;
d6a28714 2640 case NSPACEL:
3280af22 2641 PL_reg_flags |= RF_tainted;
bbce6d69 2642 /* FALL THROUGH */
d6a28714 2643 case NSPACE:
9442cb0e 2644 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2645 sayNO;
1aa99e6b 2646 if (do_utf8) {
8269fa76 2647 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2648 if (OP(scan) == NSPACE
3568d838 2649 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2650 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2651 {
2652 sayNO;
2653 }
2654 locinput += PL_utf8skip[nextchr];
2655 nextchr = UCHARAT(locinput);
2656 break;
a0ed51b3 2657 }
ffc61ed2 2658 if (OP(scan) == NSPACE
d6a28714 2659 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2660 sayNO;
76e3520e 2661 nextchr = UCHARAT(++locinput);
a0d0e21e 2662 break;
d6a28714 2663 case DIGITL:
a0ed51b3
LW
2664 PL_reg_flags |= RF_tainted;
2665 /* FALL THROUGH */
d6a28714 2666 case DIGIT:
9442cb0e 2667 if (!nextchr)
a0ed51b3 2668 sayNO;
1aa99e6b 2669 if (do_utf8) {
8269fa76 2670 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2671 if (!(OP(scan) == DIGIT
3568d838 2672 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2673 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2674 {
a0ed51b3 2675 sayNO;
dfe13c55 2676 }
6f06b55f 2677 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2678 nextchr = UCHARAT(locinput);
2679 break;
2680 }
ffc61ed2 2681 if (!(OP(scan) == DIGIT
9442cb0e 2682 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2683 sayNO;
2684 nextchr = UCHARAT(++locinput);
2685 break;
d6a28714 2686 case NDIGITL:
b8c5462f
JH
2687 PL_reg_flags |= RF_tainted;
2688 /* FALL THROUGH */
d6a28714 2689 case NDIGIT:
9442cb0e 2690 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2691 sayNO;
1aa99e6b 2692 if (do_utf8) {
8269fa76 2693 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2694 if (OP(scan) == NDIGIT
3568d838 2695 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2696 : isDIGIT_LC_utf8((U8*)locinput))
2697 {
a0ed51b3 2698 sayNO;
9442cb0e 2699 }
6f06b55f 2700 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2701 nextchr = UCHARAT(locinput);
2702 break;
2703 }
ffc61ed2 2704 if (OP(scan) == NDIGIT
9442cb0e 2705 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2706 sayNO;
2707 nextchr = UCHARAT(++locinput);
2708 break;
2709 case CLUMP:
b7c83a7e 2710 if (locinput >= PL_regeol)
a0ed51b3 2711 sayNO;
b7c83a7e
JH
2712 if (do_utf8) {
2713 LOAD_UTF8_CHARCLASS(mark,"~");
2714 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2715 sayNO;
2716 locinput += PL_utf8skip[nextchr];
2717 while (locinput < PL_regeol &&
2718 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2719 locinput += UTF8SKIP(locinput);
2720 if (locinput > PL_regeol)
2721 sayNO;
eb08e2da
JH
2722 }
2723 else
2724 locinput++;
a0ed51b3
LW
2725 nextchr = UCHARAT(locinput);
2726 break;
c8756f30 2727 case REFFL:
3280af22 2728 PL_reg_flags |= RF_tainted;
c8756f30 2729 /* FALL THROUGH */
c277df42 2730 case REF:
c8756f30 2731 case REFF:
c277df42 2732 n = ARG(scan); /* which paren pair */
cf93c79d 2733 ln = PL_regstartp[n];
2c2d71f5 2734 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2735 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2736 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2737 if (ln == PL_regendp[n])
a0d0e21e 2738 break;
a0ed51b3 2739
cf93c79d 2740 s = PL_bostr + ln;
1aa99e6b 2741 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2742 char *l = locinput;
cf93c79d 2743 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2744 /*
2745 * Note that we can't do the "other character" lookup trick as
2746 * in the 8-bit case (no pun intended) because in Unicode we
2747 * have to map both upper and title case to lower case.
2748 */
2749 if (OP(scan) == REFF) {
a2a2844f 2750 STRLEN ulen1, ulen2;
e7ae6809
JH
2751 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2752 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2753 while (s < e) {
2754 if (l >= PL_regeol)
2755 sayNO;
a2a2844f
JH
2756 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2757 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2758 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2759 sayNO;
a2a2844f
JH
2760 s += ulen1;
2761 l += ulen2;
a0ed51b3
LW
2762 }
2763 }
2764 locinput = l;
2765 nextchr = UCHARAT(locinput);
2766 break;
2767 }
2768
a0d0e21e 2769 /* Inline the first character, for speed. */
76e3520e 2770 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2771 (OP(scan) == REF ||
2772 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2773 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2774 sayNO;
cf93c79d 2775 ln = PL_regendp[n] - ln;
3280af22 2776 if (locinput + ln > PL_regeol)
4633a7c4 2777 sayNO;
c8756f30
AK
2778 if (ln > 1 && (OP(scan) == REF
2779 ? memNE(s, locinput, ln)
2780 : (OP(scan) == REFF
2781 ? ibcmp(s, locinput, ln)
2782 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2783 sayNO;
a0d0e21e 2784 locinput += ln;
76e3520e 2785 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2786 break;
2787
2788 case NOTHING:
c277df42 2789 case TAIL:
a0d0e21e
LW
2790 break;
2791 case BACK:
2792 break;
c277df42
IZ
2793 case EVAL:
2794 {
2795 dSP;
533c011a 2796 OP_4tree *oop = PL_op;
3280af22 2797 COP *ocurcop = PL_curcop;
f3548bdc 2798 PAD *old_comppad;
c277df42 2799 SV *ret;
9041c2e3 2800
c277df42 2801 n = ARG(scan);
533c011a 2802 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2803 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2804 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2805 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2806
8e5e9ebe
RGS
2807 {
2808 SV **before = SP;
2809 CALLRUNOPS(aTHX); /* Scalar context. */
2810 SPAGAIN;
2811 if (SP == before)
075aa684 2812 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2813 else {
2814 ret = POPs;
2815 PUTBACK;
2816 }
2817 }
2818
0f5d15d6 2819 PL_op = oop;
f3548bdc 2820 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2821 PL_curcop = ocurcop;
c277df42 2822 if (logical) {
0f5d15d6
IZ
2823 if (logical == 2) { /* Postponed subexpression. */
2824 regexp *re;
22c35a8c 2825 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2826 re_cc_state state;
0f5d15d6 2827 CHECKPOINT cp, lastcp;
cb50f42d 2828 int toggleutf;
0f5d15d6
IZ
2829
2830 if(SvROK(ret) || SvRMAGICAL(ret)) {
2831 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2832
2833 if(SvMAGICAL(sv))
14befaf4 2834 mg = mg_find(sv, PERL_MAGIC_qr);
0f5d15d6
IZ
2835 }
2836 if (mg) {
2837 re = (regexp *)mg->mg_obj;
df0003d4 2838 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2839 }
2840 else {
2841 STRLEN len;
2842 char *t = SvPV(ret, len);
2843 PMOP pm;
2844 char *oprecomp = PL_regprecomp;
2845 I32 osize = PL_regsize;
2846 I32 onpar = PL_regnpar;
2847
5fcd1c1b 2848 Zero(&pm, 1, PMOP);
cb50f42d 2849 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2850 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2851 if (!(SvFLAGS(ret)
0f5d15d6 2852 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
14befaf4
DM
2853 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2854 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2855 PL_regprecomp = oprecomp;
2856 PL_regsize = osize;
2857 PL_regnpar = onpar;
2858 }
2859 DEBUG_r(
9041c2e3 2860 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2861 "Entering embedded `%s%.60s%s%s'\n",
2862 PL_colors[0],
2863 re->precomp,
2864 PL_colors[1],
2865 (strlen(re->precomp) > 60 ? "..." : ""))
2866 );
2867 state.node = next;
2868 state.prev = PL_reg_call_cc;
2869 state.cc = PL_regcc;
2870 state.re = PL_reg_re;
2871
2ab05381 2872 PL_regcc = 0;
9041c2e3 2873
0f5d15d6 2874 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2875 REGCP_SET(lastcp);
0f5d15d6
IZ
2876 cache_re(re);
2877 state.ss = PL_savestack_ix;
2878 *PL_reglastparen = 0;
a01268b5 2879 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2880 PL_reg_call_cc = &state;
2881 PL_reginput = locinput;
cb50f42d
YST
2882 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2883 ((re->reganch & ROPT_UTF8) != 0);
2884 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2885
2886 /* XXXX This is too dramatic a measure... */
2887 PL_reg_maxiter = 0;
2888
0f5d15d6 2889 if (regmatch(re->program + 1)) {
2c914db6
IZ
2890 /* Even though we succeeded, we need to restore
2891 global variables, since we may be wrapped inside
2892 SUSPEND, thus the match may be not finished yet. */
2893
2894 /* XXXX Do this only if SUSPENDed? */
2895 PL_reg_call_cc = state.prev;
2896 PL_regcc = state.cc;
2897 PL_reg_re = state.re;
2898 cache_re(PL_reg_re);
cb50f42d 2899 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2900
2901 /* XXXX This is too dramatic a measure... */
2902 PL_reg_maxiter = 0;
2903
2904 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2905 ReREFCNT_dec(re);
2906 regcpblow(cp);
2907 sayYES;
2908 }
0f5d15d6 2909 ReREFCNT_dec(re);
02db2b7b 2910 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2911 regcppop();
2912 PL_reg_call_cc = state.prev;
2913 PL_regcc = state.cc;
2914 PL_reg_re = state.re;
d3790889 2915 cache_re(PL_reg_re);
cb50f42d 2916 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2917
2918 /* XXXX This is too dramatic a measure... */
2919 PL_reg_maxiter = 0;
2920
8e514ae6 2921 logical = 0;
0f5d15d6
IZ
2922 sayNO;
2923 }
c277df42 2924 sw = SvTRUE(ret);
0f5d15d6 2925 logical = 0;
a0ed51b3
LW
2926 }
2927 else
3280af22 2928 sv_setsv(save_scalar(PL_replgv), ret);
c277df42
IZ
2929 break;
2930 }
a0d0e21e 2931 case OPEN:
c277df42 2932 n = ARG(scan); /* which paren pair */
3280af22
NIS
2933 PL_reg_start_tmp[n] = locinput;
2934 if (n > PL_regsize)
2935 PL_regsize = n;
a0d0e21e
LW
2936 break;
2937 case CLOSE:
c277df42 2938 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2939 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2940 PL_regendp[n] = locinput - PL_bostr;
eb160463 2941 if (n > (I32)*PL_reglastparen)
3280af22 2942 *PL_reglastparen = n;
a01268b5 2943 *PL_reglastcloseparen = n;
a0d0e21e 2944 break;
c277df42
IZ
2945 case GROUPP:
2946 n = ARG(scan); /* which paren pair */
eb160463 2947 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2948 break;
2949 case IFTHEN:
2c2d71f5 2950 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2951 if (sw)
2952 next = NEXTOPER(NEXTOPER(scan));
2953 else {
2954 next = scan + ARG(scan);
2955 if (OP(next) == IFTHEN) /* Fake one. */
2956 next = NEXTOPER(NEXTOPER(next));
2957 }
2958 break;
2959 case LOGICAL:
0f5d15d6 2960 logical = scan->flags;
c277df42 2961 break;
2ab05381
IZ
2962/*******************************************************************
2963 PL_regcc contains infoblock about the innermost (...)* loop, and
2964 a pointer to the next outer infoblock.
2965
2966 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2967
2968 1) After matching X, regnode for CURLYX is processed;
2969
9041c2e3 2970 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
2971 regmatch() recursively with the starting point at WHILEM node;
2972
2973 3) Each hit of WHILEM node tries to match A and Z (in the order
2974 depending on the current iteration, min/max of {min,max} and
2975 greediness). The information about where are nodes for "A"
2976 and "Z" is read from the infoblock, as is info on how many times "A"
2977 was already matched, and greediness.
2978
2979 4) After A matches, the same WHILEM node is hit again.
2980
2981 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2982 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2983 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2984 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2985 of the external loop.
2986
2987 Currently present infoblocks form a tree with a stem formed by PL_curcc
2988 and whatever it mentions via ->next, and additional attached trees
2989 corresponding to temporarily unset infoblocks as in "5" above.
2990
9041c2e3 2991 In the following picture infoblocks for outer loop of
2ab05381
IZ
2992 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2993 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2994 infoblocks are drawn below the "reset" infoblock.
2995
2996 In fact in the picture below we do not show failed matches for Z and T
2997 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2998 more obvious *why* one needs to *temporary* unset infoblocks.]
2999
3000 Matched REx position InfoBlocks Comment
3001 (Y(A)*?Z)*?T x
3002 Y(A)*?Z)*?T x <- O
3003 Y (A)*?Z)*?T x <- O
3004 Y A)*?Z)*?T x <- O <- I
3005 YA )*?Z)*?T x <- O <- I
3006 YA A)*?Z)*?T x <- O <- I
3007 YAA )*?Z)*?T x <- O <- I
3008 YAA Z)*?T x <- O # Temporary unset I
3009 I
3010
3011 YAAZ Y(A)*?Z)*?T x <- O
3012 I
3013
3014 YAAZY (A)*?Z)*?T x <- O
3015 I
3016
3017 YAAZY A)*?Z)*?T x <- O <- I
3018 I
3019
3020 YAAZYA )*?Z)*?T x <- O <- I
3021 I
3022
3023 YAAZYA Z)*?T x <- O # Temporary unset I
3024 I,I
3025
3026 YAAZYAZ )*?T x <- O
3027 I,I
3028
3029 YAAZYAZ T x # Temporary unset O
3030 O
3031 I,I
3032
3033 YAAZYAZT x
3034 O
3035 I,I
3036 *******************************************************************/
a0d0e21e
LW
3037 case CURLYX: {
3038 CURCUR cc;
3280af22 3039 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3040 /* No need to save/restore up to this paren */
3041 I32 parenfloor = scan->flags;
c277df42
IZ
3042
3043 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3044 next += ARG(next);
3280af22
NIS
3045 cc.oldcc = PL_regcc;
3046 PL_regcc = &cc;
cb434fcc
IZ
3047 /* XXXX Probably it is better to teach regpush to support
3048 parenfloor > PL_regsize... */
eb160463 3049 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3050 parenfloor = *PL_reglastparen; /* Pessimization... */
3051 cc.parenfloor = parenfloor;
a0d0e21e
LW
3052 cc.cur = -1;
3053 cc.min = ARG1(scan);
3054 cc.max = ARG2(scan);
c277df42 3055 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3056 cc.next = next;
3057 cc.minmod = minmod;
3058 cc.lastloc = 0;
3280af22 3059 PL_reginput = locinput;
a0d0e21e
LW
3060 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3061 regcpblow(cp);
3280af22 3062 PL_regcc = cc.oldcc;
4633a7c4 3063 saySAME(n);
a0d0e21e
LW
3064 }
3065 /* NOT REACHED */
3066 case WHILEM: {
3067 /*
3068 * This is really hard to understand, because after we match
3069 * what we're trying to match, we must make sure the rest of
2c2d71f5 3070 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3071 * to go back UP the parse tree by recursing ever deeper. And
3072 * if it fails, we have to reset our parent's current state
3073 * that we can try again after backing off.
3074 */
3075
c277df42 3076 CHECKPOINT cp, lastcp;
3280af22 3077 CURCUR* cc = PL_regcc;
c277df42
IZ
3078 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3079
4633a7c4 3080 n = cc->cur + 1; /* how many we know we matched */
3280af22 3081 PL_reginput = locinput;
a0d0e21e 3082
c277df42 3083 DEBUG_r(
9041c2e3 3084 PerlIO_printf(Perl_debug_log,
91f3b821 3085 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3086 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3087 (long)n, (long)cc->min,
2797576d 3088 (long)cc->max, PTR2UV(cc))
c277df42 3089 );
4633a7c4 3090
a0d0e21e
LW
3091 /* If degenerate scan matches "", assume scan done. */
3092
579cf2c3 3093 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3094 PL_regcc = cc->oldcc;
2ab05381
IZ
3095 if (PL_regcc)
3096 ln = PL_regcc->cur;
c277df42 3097 DEBUG_r(
c3464db5
DD
3098 PerlIO_printf(Perl_debug_log,
3099 "%*s empty match detected, try continuation...\n",
3280af22 3100 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3101 );
a0d0e21e 3102 if (regmatch(cc->next))
4633a7c4 3103 sayYES;
2ab05381
IZ
3104 if (PL_regcc)
3105 PL_regcc->cur = ln;
3280af22 3106 PL_regcc = cc;
4633a7c4 3107 sayNO;
a0d0e21e
LW
3108 }
3109
3110 /* First just match a string of min scans. */
3111
3112 if (n < cc->min) {
3113 cc->cur = n;
3114 cc->lastloc = locinput;
4633a7c4
LW
3115 if (regmatch(cc->scan))
3116 sayYES;
3117 cc->cur = n - 1;
c277df42 3118 cc->lastloc = lastloc;
4633a7c4 3119 sayNO;
a0d0e21e
LW
3120 }
3121
2c2d71f5
JH
3122 if (scan->flags) {
3123 /* Check whether we already were at this position.
3124 Postpone detection until we know the match is not
3125 *that* much linear. */
3126 if (!PL_reg_maxiter) {
3127 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3128 PL_reg_leftiter = PL_reg_maxiter;
3129 }
3130 if (PL_reg_leftiter-- == 0) {
3131 I32 size = (PL_reg_maxiter + 7)/8;
3132 if (PL_reg_poscache) {
eb160463 3133 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3134 Renew(PL_reg_poscache, size, char);
3135 PL_reg_poscache_size = size;
3136 }
3137 Zero(PL_reg_poscache, size, char);
3138 }
3139 else {
3140 PL_reg_poscache_size = size;
3141 Newz(29, PL_reg_poscache, size, char);
3142 }
3143 DEBUG_r(
3144 PerlIO_printf(Perl_debug_log,
3145 "%sDetected a super-linear match, switching on caching%s...\n",
3146 PL_colors[4], PL_colors[5])
3147 );
3148 }
3149 if (PL_reg_leftiter < 0) {
3150 I32 o = locinput - PL_bostr, b;
3151
3152 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3153 b = o % 8;
3154 o /= 8;
3155 if (PL_reg_poscache[o] & (1<<b)) {
3156 DEBUG_r(
3157 PerlIO_printf(Perl_debug_log,
3158 "%*s already tried at this position...\n",
3159 REPORT_CODE_OFF+PL_regindent*2, "")
3160 );
7821416a 3161 sayNO_SILENT;
2c2d71f5
JH
3162 }
3163 PL_reg_poscache[o] |= (1<<b);
3164 }
3165 }
3166
a0d0e21e
LW
3167 /* Prefer next over scan for minimal matching. */
3168
3169 if (cc->minmod) {
3280af22 3170 PL_regcc = cc->oldcc;
2ab05381
IZ
3171 if (PL_regcc)
3172 ln = PL_regcc->cur;
5f05dabc 3173 cp = regcppush(cc->parenfloor);
02db2b7b 3174 REGCP_SET(lastcp);
5f05dabc 3175 if (regmatch(cc->next)) {
c277df42 3176 regcpblow(cp);
4633a7c4 3177 sayYES; /* All done. */
5f05dabc 3178 }
02db2b7b 3179 REGCP_UNWIND(lastcp);
5f05dabc 3180 regcppop();
2ab05381
IZ
3181 if (PL_regcc)
3182 PL_regcc->cur = ln;
3280af22 3183 PL_regcc = cc;
a0d0e21e 3184
c277df42 3185 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3186 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3187 && !(PL_reg_flags & RF_warned)) {
3188 PL_reg_flags |= RF_warned;
9014280d 3189 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3190 "Complex regular subexpression recursion",
3191 REG_INFTY - 1);
c277df42 3192 }
4633a7c4 3193 sayNO;
c277df42 3194 }
a687059c 3195
c277df42 3196 DEBUG_r(
c3464db5
DD
3197 PerlIO_printf(Perl_debug_log,
3198 "%*s trying longer...\n",
3280af22 3199 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3200 );
a0d0e21e 3201 /* Try scanning more and see if it helps. */
3280af22 3202 PL_reginput = locinput;
a0d0e21e
LW
3203 cc->cur = n;
3204 cc->lastloc = locinput;
5f05dabc 3205 cp = regcppush(cc->parenfloor);
02db2b7b 3206 REGCP_SET(lastcp);
5f05dabc 3207 if (regmatch(cc->scan)) {
c277df42 3208 regcpblow(cp);
4633a7c4 3209 sayYES;
5f05dabc 3210 }
02db2b7b 3211 REGCP_UNWIND(lastcp);
5f05dabc 3212 regcppop();
4633a7c4 3213 cc->cur = n - 1;
c277df42 3214 cc->lastloc = lastloc;
4633a7c4 3215 sayNO;
a0d0e21e
LW
3216 }
3217
3218 /* Prefer scan over next for maximal matching. */
3219
3220 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3221 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3222 cc->cur = n;
3223 cc->lastloc = locinput;
02db2b7b 3224 REGCP_SET(lastcp);
5f05dabc 3225 if (regmatch(cc->scan)) {
c277df42 3226 regcpblow(cp);
4633a7c4 3227 sayYES;
5f05dabc 3228 }
02db2b7b 3229 REGCP_UNWIND(lastcp);
a0d0e21e 3230 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3231 PL_reginput = locinput;
c277df42 3232 DEBUG_r(
c3464db5
DD
3233 PerlIO_printf(Perl_debug_log,
3234 "%*s failed, try continuation...\n",
3280af22 3235 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3236 );
3237 }
9041c2e3 3238 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3239 && !(PL_reg_flags & RF_warned)) {
3280af22 3240 PL_reg_flags |= RF_warned;
9014280d 3241 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3242 "Complex regular subexpression recursion",
3243 REG_INFTY - 1);
a0d0e21e
LW
3244 }
3245
3246 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3247 PL_regcc = cc->oldcc;
2ab05381
IZ
3248 if (PL_regcc)
3249 ln = PL_regcc->cur;
a0d0e21e 3250 if (regmatch(cc->next))
4633a7c4 3251 sayYES;
2ab05381
IZ
3252 if (PL_regcc)
3253 PL_regcc->cur = ln;
3280af22 3254 PL_regcc = cc;
4633a7c4 3255 cc->cur = n - 1;
c277df42 3256 cc->lastloc = lastloc;
4633a7c4 3257 sayNO;
a0d0e21e
LW
3258 }
3259 /* NOT REACHED */
9041c2e3 3260 case BRANCHJ:
c277df42
IZ
3261 next = scan + ARG(scan);
3262 if (next == scan)
3263 next = NULL;
3264 inner = NEXTOPER(NEXTOPER(scan));
3265 goto do_branch;
9041c2e3 3266 case BRANCH:
c277df42
IZ
3267 inner = NEXTOPER(scan);
3268 do_branch:
3269 {
c277df42
IZ
3270 c1 = OP(scan);
3271 if (OP(next) != c1) /* No choice. */
3272 next = inner; /* Avoid recursion. */
a0d0e21e 3273 else {
02db2b7b
IZ
3274 I32 lastparen = *PL_reglastparen;
3275 I32 unwind1;
3276 re_unwind_branch_t *uw;
3277
3278 /* Put unwinding data on stack */
3279 unwind1 = SSNEWt(1,re_unwind_branch_t);
3280 uw = SSPTRt(unwind1,re_unwind_branch_t);
3281 uw->prev = unwind;
3282 unwind = unwind1;
3283 uw->type = ((c1 == BRANCH)
3284 ? RE_UNWIND_BRANCH
3285 : RE_UNWIND_BRANCHJ);
3286 uw->lastparen = lastparen;
3287 uw->next = next;
3288 uw->locinput = locinput;
3289 uw->nextchr = nextchr;
3290#ifdef DEBUGGING
3291 uw->regindent = ++PL_regindent;
3292#endif
c277df42 3293
02db2b7b
IZ
3294 REGCP_SET(uw->lastcp);
3295
3296 /* Now go into the first branch */
3297 next = inner;
a687059c 3298 }
a0d0e21e
LW
3299 }
3300 break;
3301 case MINMOD:
3302 minmod = 1;
3303 break;
c277df42
IZ
3304 case CURLYM:
3305 {
00db4c45 3306 I32 l = 0;
c277df42 3307 CHECKPOINT lastcp;
9041c2e3 3308
c277df42
IZ
3309 /* We suppose that the next guy does not need
3310 backtracking: in particular, it is of constant length,
3311 and has no parenths to influence future backrefs. */
3312 ln = ARG1(scan); /* min to match */
3313 n = ARG2(scan); /* max to match */
c277df42
IZ
3314 paren = scan->flags;
3315 if (paren) {
3280af22
NIS
3316 if (paren > PL_regsize)
3317 PL_regsize = paren;
eb160463 3318 if (paren > (I32)*PL_reglastparen)
3280af22 3319 *PL_reglastparen = paren;
c277df42 3320 }
dc45a647 3321 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3322 if (paren)
3323 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3324 PL_reginput = locinput;
c277df42
IZ
3325 if (minmod) {
3326 minmod = 0;
3327 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3328 sayNO;
f31a99c8
HS
3329 /* if we matched something zero-length we don't need to
3330 backtrack - capturing parens are already defined, so
3331 the caveat in the maximal case doesn't apply
3332
3333 XXXX if ln == 0, we can redo this check first time
3334 through the following loop
3335 */
3336 if (ln && l == 0)
3337 n = ln; /* don't backtrack */
3280af22 3338 locinput = PL_reginput;
cca55fe3 3339 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3340 regnode *text_node = next;
3341
cca55fe3 3342 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3343
cca55fe3 3344 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3345 else {
cca55fe3
JP
3346 if (PL_regkind[(U8)OP(text_node)] == REF) {
3347 I32 n, ln;
3348 n = ARG(text_node); /* which paren pair */
3349 ln = PL_regstartp[n];
3350 /* assume yes if we haven't seen CLOSEn */
3351 if (
eb160463 3352 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3353 ln == -1 ||
3354 ln == PL_regendp[n]
3355 ) {
3356 c1 = c2 = -1000;
3357 goto assume_ok_MM;
3358 }
3359 c1 = *(PL_bostr + ln);
3360 }
3361 else { c1 = (U8)*STRING(text_node); }
af5decee 3362 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3363 c2 = PL_fold[c1];
af5decee 3364 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3365 c2 = PL_fold_locale[c1];
3366 else
3367 c2 = c1;
3368 }
a0ed51b3
LW
3369 }
3370 else
c277df42 3371 c1 = c2 = -1000;
cca55fe3 3372 assume_ok_MM:
02db2b7b 3373 REGCP_SET(lastcp);
5f4b28b2 3374 /* This may be improved if l == 0. */
c277df42
IZ
3375 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3376 /* If it could work, try it. */
3377 if (c1 == -1000 ||
3280af22
NIS
3378 UCHARAT(PL_reginput) == c1 ||
3379 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3380 {
3381 if (paren) {
f31a99c8 3382 if (ln) {
cf93c79d
IZ
3383 PL_regstartp[paren] =
3384 HOPc(PL_reginput, -l) - PL_bostr;
3385 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3386 }
3387 else
cf93c79d 3388 PL_regendp[paren] = -1;
c277df42
IZ
3389 }
3390 if (regmatch(next))
3391 sayYES;
02db2b7b 3392 REGCP_UNWIND(lastcp);
c277df42
IZ
3393 }
3394 /* Couldn't or didn't -- move forward. */
3280af22 3395 PL_reginput = locinput;
c277df42
IZ
3396 if (regrepeat_hard(scan, 1, &l)) {
3397 ln++;
3280af22 3398 locinput = PL_reginput;
c277df42
IZ
3399 }
3400 else
3401 sayNO;
3402 }
a0ed51b3
LW
3403 }
3404 else {
c277df42 3405 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3406 /* if we matched something zero-length we don't need to
3407 backtrack, unless the minimum count is zero and we
3408 are capturing the result - in that case the capture
3409 being defined or not may affect later execution
3410 */
3411 if (n != 0 && l == 0 && !(paren && ln == 0))
3412 ln = n; /* don't backtrack */
3280af22 3413 locinput = PL_reginput;
c277df42 3414 DEBUG_r(
5c0ca799 3415 PerlIO_printf(Perl_debug_log,
faccc32b 3416 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3417 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3418 (IV) n, (IV)l)
c277df42
IZ
3419 );
3420 if (n >= ln) {
cca55fe3 3421 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3422 regnode *text_node = next;
3423
cca55fe3 3424 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3425
cca55fe3 3426 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3427 else {
cca55fe3
JP
3428 if (PL_regkind[(U8)OP(text_node)] == REF) {
3429 I32 n, ln;
3430 n = ARG(text_node); /* which paren pair */
3431 ln = PL_regstartp[n];
3432 /* assume yes if we haven't seen CLOSEn */
3433 if (
eb160463 3434 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3435 ln == -1 ||
3436 ln == PL_regendp[n]
3437 ) {
3438 c1 = c2 = -1000;
3439 goto assume_ok_REG;
3440 }
3441 c1 = *(PL_bostr + ln);
3442 }
3443 else { c1 = (U8)*STRING(text_node); }
3444
af5decee 3445 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3446 c2 = PL_fold[c1];
af5decee 3447 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3448 c2 = PL_fold_locale[c1];
3449 else
3450 c2 = c1;
3451 }
a0ed51b3
LW
3452 }
3453 else
c277df42
IZ
3454 c1 = c2 = -1000;
3455 }
cca55fe3 3456 assume_ok_REG:
02db2b7b 3457 REGCP_SET(lastcp);
c277df42
IZ
3458 while (n >= ln) {
3459 /* If it could work, try it. */
3460 if (c1 == -1000 ||
3280af22
NIS
3461 UCHARAT(PL_reginput) == c1 ||
3462 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3463 {
3464 DEBUG_r(
c3464db5 3465 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3466 "%*s trying tail with n=%"IVdf"...\n",
3467 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3468 );
3469 if (paren) {
3470 if (n) {
cf93c79d
IZ
3471 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3472 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3473 }
a0ed51b3 3474 else
cf93c79d 3475 PL_regendp[paren] = -1;
c277df42 3476 }
a0ed51b3
LW
3477 if (regmatch(next))
3478 sayYES;
02db2b7b 3479 REGCP_UNWIND(lastcp);
a0ed51b3 3480 }
c277df42
IZ
3481 /* Couldn't or didn't -- back up. */
3482 n--;
dfe13c55 3483 locinput = HOPc(locinput, -l);
3280af22 3484 PL_reginput = locinput;
c277df42
IZ
3485 }
3486 }
3487 sayNO;
3488 break;
3489 }
3490 case CURLYN:
3491 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3492 if (paren > PL_regsize)
3493 PL_regsize = paren;
eb160463 3494 if (paren > (I32)*PL_reglastparen)
3280af22 3495 *PL_reglastparen = paren;
c277df42
IZ
3496 ln = ARG1(scan); /* min to match */
3497 n = ARG2(scan); /* max to match */
dc45a647 3498 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3499 goto repeat;
a0d0e21e 3500 case CURLY:
c277df42 3501 paren = 0;
a0d0e21e
LW
3502 ln = ARG1(scan); /* min to match */
3503 n = ARG2(scan); /* max to match */
dc45a647 3504 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3505 goto repeat;
3506 case STAR:
3507 ln = 0;
c277df42 3508 n = REG_INFTY;
a0d0e21e 3509 scan = NEXTOPER(scan);
c277df42 3510 paren = 0;
a0d0e21e
LW
3511 goto repeat;
3512 case PLUS:
c277df42
IZ
3513 ln = 1;
3514 n = REG_INFTY;
3515 scan = NEXTOPER(scan);
3516 paren = 0;
3517 repeat:
a0d0e21e
LW
3518 /*
3519 * Lookahead to avoid useless match attempts
3520 * when we know what character comes next.
3521 */
5f80c4cf
JP
3522
3523 /*
3524 * Used to only do .*x and .*?x, but now it allows
3525 * for )'s, ('s and (?{ ... })'s to be in the way
3526 * of the quantifier and the EXACT-like node. -- japhy
3527 */
3528
cca55fe3 3529 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3530 U8 *s;
3531 regnode *text_node = next;
3532
cca55fe3 3533 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3534
cca55fe3 3535 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3536 else {
cca55fe3
JP
3537 if (PL_regkind[(U8)OP(text_node)] == REF) {
3538 I32 n, ln;
3539 n = ARG(text_node); /* which paren pair */
3540 ln = PL_regstartp[n];
3541 /* assume yes if we haven't seen CLOSEn */
3542 if (