This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix copy & paste bugs in mX?PUSH macro tests.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
4bb101f2 70 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b5f8cc5c 71 **** 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
a687059c 72 ****
9ef589d8
LW
73 **** You may distribute under the terms of either the GNU General Public
74 **** License or the Artistic License, as specified in the README file.
a687059c
LW
75 *
76 * Beware that some of this code is subtly aware of the way operator
77 * precedence is structured in regular expressions. Serious changes in
78 * regular-expression syntax might require a total rethink.
79 */
80#include "EXTERN.h"
864dbfa3 81#define PERL_IN_REGEXEC_C
a687059c 82#include "perl.h"
0f5d15d6 83
a687059c
LW
84#include "regcomp.h"
85
c277df42
IZ
86#define RF_tainted 1 /* tainted information used? */
87#define RF_warned 2 /* warned about big count? */
ce862d02 88#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3 89#define RF_utf8 8 /* String contains multibyte chars? */
c2b0868c 90#define RF_false 16 /* odd number of nested negatives */
a0ed51b3 91
eb160463 92#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
93
94#define RS_init 1 /* eval environment created */
95#define RS_set 2 /* replsv value is set */
c277df42 96
a687059c
LW
97#ifndef STATIC
98#define STATIC static
99#endif
100
ba7b4546 101#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
33b8afdf 107#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
dfe13c55
GS
110#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
111#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
112#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
113#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
114#define HOPc(pos,off) ((char*)HOP(pos,off))
115#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 116
efb30f32 117#define HOPBACK(pos, off) ( \
e54858b0 118 (PL_reg_match_utf8) \
efb30f32
HS
119 ? reghopmaybe((U8*)pos, -off) \
120 : (pos - off >= PL_bostr) \
121 ? (U8*)(pos - off) \
122 : (U8*)NULL \
123)
124#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
125
1aa99e6b
IH
126#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
127#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
128#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
129#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
130#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
131#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
132
977e4b84 133#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 134
5f80c4cf 135/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
136#define JUMPABLE(rn) ( \
137 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
138 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
139 OP(rn) == PLUS || OP(rn) == MINMOD || \
140 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
141)
142
cca55fe3
JP
143#define HAS_TEXT(rn) ( \
144 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
145)
e2d8ce26 146
a84d97b6
HS
147/*
148 Search for mandatory following text node; for lookahead, the text must
149 follow but for lookbehind (rn->flags != 0) we skip to the next step.
150*/
cca55fe3 151#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 152 while (JUMPABLE(rn)) \
a84d97b6 153 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 154 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
155 else if (OP(rn) == PLUS) \
156 rn = NEXTOPER(rn); \
a84d97b6
HS
157 else if (OP(rn) == IFMATCH) \
158 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 159 else rn += NEXT_OFF(rn); \
5f80c4cf 160} STMT_END
74750237 161
acfe0abc 162static void restore_pos(pTHX_ void *arg);
51371543 163
76e3520e 164STATIC CHECKPOINT
cea2e8a9 165S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 166{
3280af22 167 int retval = PL_savestack_ix;
b1ce53c5
JH
168#define REGCP_PAREN_ELEMS 4
169 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
170 int p;
171
e49a9654
IH
172 if (paren_elems_to_push < 0)
173 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
174
a01268b5 175#define REGCP_OTHER_ELEMS 6
4b3c1a47 176 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 177 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 178/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
179 SSPUSHINT(PL_regendp[p]);
180 SSPUSHINT(PL_regstartp[p]);
3280af22 181 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
182 SSPUSHINT(p);
183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
a01268b5 187 SSPUSHINT(*PL_reglastcloseparen);
3280af22 188 SSPUSHPTR(PL_reginput);
41123dfd
JH
189#define REGCP_FRAME_ELEMS 2
190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 194
a0d0e21e
LW
195 return retval;
196}
197
c277df42 198/* These are needed since we do not localize EVAL nodes: */
02db2b7b 199# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 202
02db2b7b 203# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 204 PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 207
76e3520e 208STATIC char *
cea2e8a9 209S_regcppop(pTHX)
a0d0e21e 210{
b1ce53c5 211 I32 i;
a0d0e21e
LW
212 U32 paren = 0;
213 char *input;
cf93c79d 214 I32 tmps;
b1ce53c5
JH
215
216 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 217 i = SSPOPINT;
b1ce53c5
JH
218 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
219 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 220 input = (char *) SSPOPPTR;
a01268b5 221 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
222 *PL_reglastparen = SSPOPINT;
223 PL_regsize = SSPOPINT;
b1ce53c5
JH
224
225 /* Now restore the parentheses context. */
41123dfd
JH
226 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
227 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 228 paren = (U32)SSPOPINT;
3280af22 229 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
230 PL_regstartp[paren] = SSPOPINT;
231 tmps = SSPOPINT;
3280af22
NIS
232 if (paren <= *PL_reglastparen)
233 PL_regendp[paren] = tmps;
c277df42 234 DEBUG_r(
c3464db5 235 PerlIO_printf(Perl_debug_log,
b900a521 236 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 237 (UV)paren, (IV)PL_regstartp[paren],
b900a521 238 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 239 (IV)PL_regendp[paren],
3280af22 240 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 241 );
a0d0e21e 242 }
c277df42 243 DEBUG_r(
eb160463 244 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 245 PerlIO_printf(Perl_debug_log,
faccc32b
JH
246 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
247 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
248 }
249 );
daf18116 250#if 1
dafc8851
JH
251 /* It would seem that the similar code in regtry()
252 * already takes care of this, and in fact it is in
253 * a better location to since this code can #if 0-ed out
254 * but the code in regtry() is needed or otherwise tests
255 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
256 * (as of patchlevel 7877) will fail. Then again,
257 * this code seems to be necessary or otherwise
258 * building DynaLoader will fail:
259 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
260 * --jhi */
eb160463
GS
261 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
262 if ((I32)paren > PL_regsize)
cf93c79d
IZ
263 PL_regstartp[paren] = -1;
264 PL_regendp[paren] = -1;
a0d0e21e 265 }
dafc8851 266#endif
a0d0e21e
LW
267 return input;
268}
269
0f5d15d6 270STATIC char *
cea2e8a9 271S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
272{
273 I32 tmp = PL_savestack_ix;
274
275 PL_savestack_ix = ss;
276 regcppop();
277 PL_savestack_ix = tmp;
942e002e 278 return Nullch;
0f5d15d6
IZ
279}
280
281typedef struct re_cc_state
282{
283 I32 ss;
284 regnode *node;
285 struct re_cc_state *prev;
286 CURCUR *cc;
287 regexp *re;
288} re_cc_state;
289
02db2b7b 290#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 291
29d1e993
HS
292#define TRYPAREN(paren, n, input) { \
293 if (paren) { \
294 if (n) { \
295 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
296 PL_regendp[paren] = input - PL_bostr; \
297 } \
298 else \
299 PL_regendp[paren] = -1; \
300 } \
301 if (regmatch(next)) \
302 sayYES; \
303 if (paren && n) \
304 PL_regendp[paren] = -1; \
305}
306
307
a687059c 308/*
e50aee73 309 * pregexec and friends
a687059c
LW
310 */
311
312/*
c277df42 313 - pregexec - match a regexp against a string
a687059c 314 */
c277df42 315I32
864dbfa3 316Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 317 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
318/* strend: pointer to null at end of string */
319/* strbeg: real beginning of string */
320/* minend: end of match must be >=minend after stringarg. */
321/* nosave: For optimizations. */
322{
323 return
9041c2e3 324 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
325 nosave ? 0 : REXEC_COPY_STR);
326}
0f5d15d6
IZ
327
328STATIC void
cea2e8a9 329S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
330{
331 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
332#ifdef DEBUGGING
333 PL_regprogram = prog->program;
334#endif
335 PL_regnpar = prog->nparens;
9041c2e3
NIS
336 PL_regdata = prog->data;
337 PL_reg_re = prog;
0f5d15d6 338}
22e551b9 339
9041c2e3 340/*
cad2e5aa
JH
341 * Need to implement the following flags for reg_anch:
342 *
343 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
344 * USE_INTUIT_ML
345 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
346 * INTUIT_AUTORITATIVE_ML
347 * INTUIT_ONCE_NOML - Intuit can match in one location only.
348 * INTUIT_ONCE_ML
349 *
350 * Another flag for this function: SECOND_TIME (so that float substrs
351 * with giant delta may be not rechecked).
352 */
353
354/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
355
2c2d71f5 356/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
357 Otherwise, only SvCUR(sv) is used to get strbeg. */
358
359/* XXXX We assume that strpos is strbeg unless sv. */
360
6eb5f6b9
JH
361/* XXXX Some places assume that there is a fixed substring.
362 An update may be needed if optimizer marks as "INTUITable"
363 RExen without fixed substrings. Similarly, it is assumed that
364 lengths of all the strings are no more than minlen, thus they
365 cannot come from lookahead.
366 (Or minlen should take into account lookahead.) */
367
2c2d71f5
JH
368/* A failure to find a constant substring means that there is no need to make
369 an expensive call to REx engine, thus we celebrate a failure. Similarly,
370 finding a substring too deep into the string means that less calls to
30944b6d
IZ
371 regtry() should be needed.
372
373 REx compiler's optimizer found 4 possible hints:
374 a) Anchored substring;
375 b) Fixed substring;
376 c) Whether we are anchored (beginning-of-line or \G);
377 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 378 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
379 string which does not contradict any of them.
380 */
2c2d71f5 381
6eb5f6b9
JH
382/* Most of decisions we do here should have been done at compile time.
383 The nodes of the REx which we used for the search should have been
384 deleted from the finite automaton. */
385
cad2e5aa
JH
386char *
387Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
388 char *strend, U32 flags, re_scream_pos_data *data)
389{
b7953727 390 register I32 start_shift = 0;
cad2e5aa 391 /* Should be nonnegative! */
b7953727 392 register I32 end_shift = 0;
2c2d71f5
JH
393 register char *s;
394 register SV *check;
a1933d95 395 char *strbeg;
cad2e5aa 396 char *t;
33b8afdf 397 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 398 I32 ml_anch;
6eb5f6b9 399 register char *other_last = Nullch; /* other substr checked before this */
b7953727 400 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
401#ifdef DEBUGGING
402 char *i_strpos = strpos;
ce333219 403 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 404#endif
a30b2f1f 405 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 406
b8d68ded
JH
407 if (prog->reganch & ROPT_UTF8) {
408 DEBUG_r(PerlIO_printf(Perl_debug_log,
409 "UTF-8 regex...\n"));
410 PL_reg_flags |= RF_utf8;
411 }
412
2a782b5b 413 DEBUG_r({
b8d68ded 414 char *s = PL_reg_match_utf8 ?
c728cb41
JH
415 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
416 strpos;
b8d68ded
JH
417 int len = PL_reg_match_utf8 ?
418 strlen(s) : strend - strpos;
2a782b5b
JH
419 if (!PL_colorset)
420 reginitcolors();
b8d68ded
JH
421 if (PL_reg_match_utf8)
422 DEBUG_r(PerlIO_printf(Perl_debug_log,
423 "UTF-8 target...\n"));
2a782b5b
JH
424 PerlIO_printf(Perl_debug_log,
425 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
426 PL_colors[4],PL_colors[5],PL_colors[0],
427 prog->precomp,
428 PL_colors[1],
429 (strlen(prog->precomp) > 60 ? "..." : ""),
430 PL_colors[0],
431 (int)(len > 60 ? 60 : len),
432 s, PL_colors[1],
433 (len > 60 ? "..." : "")
434 );
435 });
cad2e5aa 436
c344f387
JH
437 /* CHR_DIST() would be more correct here but it makes things slow. */
438 if (prog->minlen > strend - strpos) {
a72c7584
JH
439 DEBUG_r(PerlIO_printf(Perl_debug_log,
440 "String too short... [re_intuit_start]\n"));
cad2e5aa 441 goto fail;
2c2d71f5 442 }
a1933d95 443 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 444 PL_regeol = strend;
33b8afdf
JH
445 if (do_utf8) {
446 if (!prog->check_utf8 && prog->check_substr)
447 to_utf8_substr(prog);
448 check = prog->check_utf8;
449 } else {
450 if (!prog->check_substr && prog->check_utf8)
451 to_byte_substr(prog);
452 check = prog->check_substr;
453 }
454 if (check == &PL_sv_undef) {
455 DEBUG_r(PerlIO_printf(Perl_debug_log,
456 "Non-utf string cannot match utf check string\n"));
457 goto fail;
458 }
2c2d71f5 459 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
460 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
461 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 462 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 463
7e25d62c
JH
464 if (!ml_anch) {
465 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
466 | ROPT_IMPLICIT)) /* not a real BOL */
467 /* SvCUR is not set on references: SvRV and SvPVX overlap */
468 && sv && !SvROK(sv)
469 && (strpos != strbeg)) {
470 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
471 goto fail;
472 }
473 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 474 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 475 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
476 I32 slen;
477
1aa99e6b 478 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
479 if (SvTAIL(check)) {
480 slen = SvCUR(check); /* >= 1 */
cad2e5aa 481
9041c2e3 482 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
483 || (strend - s == slen && strend[-1] != '\n')) {
484 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
485 goto fail_finish;
cad2e5aa
JH
486 }
487 /* Now should match s[0..slen-2] */
488 slen--;
653099ff 489 if (slen && (*SvPVX(check) != *s
cad2e5aa 490 || (slen > 1
653099ff 491 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
492 report_neq:
493 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
494 goto fail_finish;
495 }
cad2e5aa 496 }
653099ff
GS
497 else if (*SvPVX(check) != *s
498 || ((slen = SvCUR(check)) > 1
499 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
500 goto report_neq;
501 goto success_at_start;
7e25d62c 502 }
cad2e5aa 503 }
2c2d71f5 504 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 505 s = strpos;
2c2d71f5 506 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 507 end_shift = prog->minlen - start_shift -
653099ff 508 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 509 if (!ml_anch) {
653099ff
GS
510 I32 end = prog->check_offset_max + CHR_SVLEN(check)
511 - (SvTAIL(check) != 0);
1aa99e6b 512 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
513
514 if (end_shift < eshift)
515 end_shift = eshift;
516 }
cad2e5aa 517 }
2c2d71f5 518 else { /* Can match at random position */
cad2e5aa
JH
519 ml_anch = 0;
520 s = strpos;
2c2d71f5
JH
521 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
522 /* Should be nonnegative! */
523 end_shift = prog->minlen - start_shift -
653099ff 524 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
525 }
526
2c2d71f5 527#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 528 if (end_shift < 0)
6bbae5e6 529 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
530#endif
531
2c2d71f5
JH
532 restart:
533 /* Find a possible match in the region s..strend by looking for
534 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 535 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
536 I32 p = -1; /* Internal iterator of scream. */
537 I32 *pp = data ? data->scream_pos : &p;
538
2c2d71f5
JH
539 if (PL_screamfirst[BmRARE(check)] >= 0
540 || ( BmRARE(check) == '\n'
541 && (BmPREVIOUS(check) == SvCUR(check) - 1)
542 && SvTAIL(check) ))
9041c2e3 543 s = screaminstr(sv, check,
2c2d71f5 544 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 545 else
2c2d71f5 546 goto fail_finish;
4addbd3b
HS
547 /* we may be pointing at the wrong string */
548 if (s && RX_MATCH_COPIED(prog))
7ef91622 549 s = strbeg + (s - SvPVX(sv));
cad2e5aa
JH
550 if (data)
551 *data->scream_olds = s;
552 }
f33976b4 553 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
554 s = fbm_instr((U8*)(s + start_shift),
555 (U8*)(strend - end_shift),
556 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 557 else
1aa99e6b
IH
558 s = fbm_instr(HOP3(s, start_shift, strend),
559 HOP3(strend, -end_shift, strbeg),
2c2d71f5 560 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
561
562 /* Update the count-of-usability, remove useless subpatterns,
563 unshift s. */
2c2d71f5
JH
564
565 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
566 (s ? "Found" : "Did not find"),
33b8afdf 567 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 568 PL_colors[0],
7b0972df
JH
569 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
570 SvPVX(check),
2c2d71f5
JH
571 PL_colors[1], (SvTAIL(check) ? "$" : ""),
572 (s ? " at offset " : "...\n") ) );
573
574 if (!s)
575 goto fail_finish;
576
6eb5f6b9
JH
577 check_at = s;
578
2c2d71f5 579 /* Finish the diagnostic message */
30944b6d 580 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
581
582 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
583 Start with the other substr.
584 XXXX no SCREAM optimization yet - and a very coarse implementation
585 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
586 *always* match. Probably should be marked during compile...
587 Probably it is right to do no SCREAM here...
588 */
589
33b8afdf 590 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 591 /* Take into account the "other" substring. */
2c2d71f5
JH
592 /* XXXX May be hopelessly wrong for UTF... */
593 if (!other_last)
6eb5f6b9 594 other_last = strpos;
33b8afdf 595 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
596 do_other_anchored:
597 {
1aa99e6b 598 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 599 char *s1 = s;
33b8afdf 600 SV* must;
2c2d71f5 601
2c2d71f5
JH
602 t = s - prog->check_offset_max;
603 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 604 && (!do_utf8
1aa99e6b 605 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 606 && t > strpos)))
30944b6d 607 /* EMPTY */;
2c2d71f5
JH
608 else
609 t = strpos;
1aa99e6b 610 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
611 if (t < other_last) /* These positions already checked */
612 t = other_last;
1aa99e6b 613 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
614 if (last < last1)
615 last1 = last;
616 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
617 /* On end-of-str: see comment below. */
33b8afdf
JH
618 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
619 if (must == &PL_sv_undef) {
620 s = (char*)NULL;
621 DEBUG_r(must = prog->anchored_utf8); /* for debug */
622 }
623 else
624 s = fbm_instr(
625 (unsigned char*)t,
626 HOP3(HOP3(last1, prog->anchored_offset, strend)
627 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
628 must,
629 PL_multiline ? FBMrf_MULTILINE : 0
630 );
1aa99e6b
IH
631 DEBUG_r(PerlIO_printf(Perl_debug_log,
632 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
633 (s ? "Found" : "Contradicts"),
634 PL_colors[0],
33b8afdf
JH
635 (int)(SvCUR(must)
636 - (SvTAIL(must)!=0)),
637 SvPVX(must),
638 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
639 if (!s) {
640 if (last1 >= last2) {
641 DEBUG_r(PerlIO_printf(Perl_debug_log,
642 ", giving up...\n"));
643 goto fail_finish;
644 }
645 DEBUG_r(PerlIO_printf(Perl_debug_log,
646 ", trying floating at offset %ld...\n",
1aa99e6b
IH
647 (long)(HOP3c(s1, 1, strend) - i_strpos)));
648 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
649 s = HOP3c(last, 1, strend);
2c2d71f5
JH
650 goto restart;
651 }
652 else {
653 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 654 (long)(s - i_strpos)));
1aa99e6b
IH
655 t = HOP3c(s, -prog->anchored_offset, strbeg);
656 other_last = HOP3c(s, 1, strend);
30944b6d 657 s = s1;
2c2d71f5
JH
658 if (t == strpos)
659 goto try_at_start;
2c2d71f5
JH
660 goto try_at_offset;
661 }
30944b6d 662 }
2c2d71f5
JH
663 }
664 else { /* Take into account the floating substring. */
33b8afdf
JH
665 char *last, *last1;
666 char *s1 = s;
667 SV* must;
668
669 t = HOP3c(s, -start_shift, strbeg);
670 last1 = last =
671 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
672 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
673 last = HOP3c(t, prog->float_max_offset, strend);
674 s = HOP3c(t, prog->float_min_offset, strend);
675 if (s < other_last)
676 s = other_last;
2c2d71f5 677 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
678 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
679 /* fbm_instr() takes into account exact value of end-of-str
680 if the check is SvTAIL(ed). Since false positives are OK,
681 and end-of-str is not later than strend we are OK. */
682 if (must == &PL_sv_undef) {
683 s = (char*)NULL;
684 DEBUG_r(must = prog->float_utf8); /* for debug message */
685 }
686 else
2c2d71f5 687 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
688 (unsigned char*)last + SvCUR(must)
689 - (SvTAIL(must)!=0),
690 must, PL_multiline ? FBMrf_MULTILINE : 0);
691 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
692 (s ? "Found" : "Contradicts"),
693 PL_colors[0],
694 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
695 SvPVX(must),
696 PL_colors[1], (SvTAIL(must) ? "$" : "")));
697 if (!s) {
698 if (last1 == last) {
2c2d71f5 699 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
700 ", giving up...\n"));
701 goto fail_finish;
2c2d71f5 702 }
33b8afdf
JH
703 DEBUG_r(PerlIO_printf(Perl_debug_log,
704 ", trying anchored starting at offset %ld...\n",
705 (long)(s1 + 1 - i_strpos)));
706 other_last = last;
707 s = HOP3c(t, 1, strend);
708 goto restart;
709 }
710 else {
711 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
712 (long)(s - i_strpos)));
713 other_last = s; /* Fix this later. --Hugo */
714 s = s1;
715 if (t == strpos)
716 goto try_at_start;
717 goto try_at_offset;
718 }
2c2d71f5 719 }
cad2e5aa 720 }
2c2d71f5
JH
721
722 t = s - prog->check_offset_max;
2c2d71f5 723 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 724 && (!do_utf8
1aa99e6b
IH
725 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
726 && t > strpos))) {
2c2d71f5
JH
727 /* Fixed substring is found far enough so that the match
728 cannot start at strpos. */
729 try_at_offset:
cad2e5aa 730 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
731 /* Eventually fbm_*() should handle this, but often
732 anchored_offset is not 0, so this check will not be wasted. */
733 /* XXXX In the code below we prefer to look for "^" even in
734 presence of anchored substrings. And we search even
735 beyond the found float position. These pessimizations
736 are historical artefacts only. */
737 find_anchor:
2c2d71f5 738 while (t < strend - prog->minlen) {
cad2e5aa 739 if (*t == '\n') {
4ee3650e 740 if (t < check_at - prog->check_offset_min) {
33b8afdf 741 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
742 /* Since we moved from the found position,
743 we definitely contradict the found anchored
30944b6d
IZ
744 substr. Due to the above check we do not
745 contradict "check" substr.
746 Thus we can arrive here only if check substr
747 is float. Redo checking for "other"=="fixed".
748 */
9041c2e3 749 strpos = t + 1;
30944b6d
IZ
750 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
751 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
752 goto do_other_anchored;
753 }
4ee3650e
GS
754 /* We don't contradict the found floating substring. */
755 /* XXXX Why not check for STCLASS? */
cad2e5aa 756 s = t + 1;
2c2d71f5 757 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 758 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
759 goto set_useful;
760 }
4ee3650e
GS
761 /* Position contradicts check-string */
762 /* XXXX probably better to look for check-string
763 than for "\n", so one should lower the limit for t? */
764 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 765 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 766 other_last = strpos = s = t + 1;
cad2e5aa
JH
767 goto restart;
768 }
769 t++;
770 }
2c2d71f5
JH
771 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
772 PL_colors[0],PL_colors[1]));
773 goto fail_finish;
cad2e5aa 774 }
f5952150
GS
775 else {
776 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
777 PL_colors[0],PL_colors[1]));
778 }
cad2e5aa
JH
779 s = t;
780 set_useful:
33b8afdf 781 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
782 }
783 else {
f5952150 784 /* The found string does not prohibit matching at strpos,
2c2d71f5 785 - no optimization of calling REx engine can be performed,
f5952150
GS
786 unless it was an MBOL and we are not after MBOL,
787 or a future STCLASS check will fail this. */
2c2d71f5
JH
788 try_at_start:
789 /* Even in this situation we may use MBOL flag if strpos is offset
790 wrt the start of the string. */
05b4157f 791 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 792 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
793 /* May be due to an implicit anchor of m{.*foo} */
794 && !(prog->reganch & ROPT_IMPLICIT))
795 {
cad2e5aa
JH
796 t = strpos;
797 goto find_anchor;
798 }
30944b6d 799 DEBUG_r( if (ml_anch)
f5952150
GS
800 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
801 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 802 );
2c2d71f5 803 success_at_start:
30944b6d 804 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
805 && (do_utf8 ? (
806 prog->check_utf8 /* Could be deleted already */
807 && --BmUSEFUL(prog->check_utf8) < 0
808 && (prog->check_utf8 == prog->float_utf8)
809 ) : (
810 prog->check_substr /* Could be deleted already */
811 && --BmUSEFUL(prog->check_substr) < 0
812 && (prog->check_substr == prog->float_substr)
813 )))
66e933ab 814 {
cad2e5aa 815 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 816 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
817 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
818 if (do_utf8 ? prog->check_substr : prog->check_utf8)
819 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
820 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
821 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 822 check = Nullsv; /* abort */
cad2e5aa 823 s = strpos;
3cf5c195
IZ
824 /* XXXX This is a remnant of the old implementation. It
825 looks wasteful, since now INTUIT can use many
6eb5f6b9 826 other heuristics. */
cad2e5aa
JH
827 prog->reganch &= ~RE_USE_INTUIT;
828 }
829 else
830 s = strpos;
831 }
832
6eb5f6b9
JH
833 /* Last resort... */
834 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
835 if (prog->regstclass) {
836 /* minlen == 0 is possible if regstclass is \b or \B,
837 and the fixed substr is ''$.
838 Since minlen is already taken into account, s+1 is before strend;
839 accidentally, minlen >= 1 guaranties no false positives at s + 1
840 even for \b or \B. But (minlen? 1 : 0) below assumes that
841 regstclass does not come from lookahead... */
842 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
843 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 844 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 845 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 846 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 847 : 1);
33b8afdf 848 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 849 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 850 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
851 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
852 cl_l, strend)
853 : strend);
a1933d95 854 char *startpos = strbeg;
6eb5f6b9
JH
855
856 t = s;
9926ca43 857 cache_re(prog);
f33976b4 858 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
859 if (!s) {
860#ifdef DEBUGGING
b7953727 861 char *what = 0;
6eb5f6b9
JH
862#endif
863 if (endpos == strend) {
864 DEBUG_r( PerlIO_printf(Perl_debug_log,
865 "Could not match STCLASS...\n") );
866 goto fail;
867 }
66e933ab
GS
868 DEBUG_r( PerlIO_printf(Perl_debug_log,
869 "This position contradicts STCLASS...\n") );
653099ff
GS
870 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
871 goto fail;
6eb5f6b9 872 /* Contradict one of substrings */
33b8afdf
JH
873 if (prog->anchored_substr || prog->anchored_utf8) {
874 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
875 DEBUG_r( what = "anchored" );
876 hop_and_restart:
1aa99e6b 877 s = HOP3c(t, 1, strend);
66e933ab
GS
878 if (s + start_shift + end_shift > strend) {
879 /* XXXX Should be taken into account earlier? */
880 DEBUG_r( PerlIO_printf(Perl_debug_log,
881 "Could not match STCLASS...\n") );
882 goto fail;
883 }
5e39e1e5
HS
884 if (!check)
885 goto giveup;
6eb5f6b9 886 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 887 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
888 what, (long)(s + start_shift - i_strpos)) );
889 goto restart;
890 }
66e933ab 891 /* Have both, check_string is floating */
6eb5f6b9
JH
892 if (t + start_shift >= check_at) /* Contradicts floating=check */
893 goto retry_floating_check;
894 /* Recheck anchored substring, but not floating... */
9041c2e3 895 s = check_at;
5e39e1e5
HS
896 if (!check)
897 goto giveup;
6eb5f6b9 898 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 899 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
900 (long)(other_last - i_strpos)) );
901 goto do_other_anchored;
902 }
60e71179
GS
903 /* Another way we could have checked stclass at the
904 current position only: */
905 if (ml_anch) {
906 s = t = t + 1;
5e39e1e5
HS
907 if (!check)
908 goto giveup;
60e71179 909 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
910 "Looking for /%s^%s/m starting at offset %ld...\n",
911 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 912 goto try_at_offset;
66e933ab 913 }
33b8afdf 914 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 915 goto fail;
6eb5f6b9
JH
916 /* Check is floating subtring. */
917 retry_floating_check:
918 t = check_at - start_shift;
919 DEBUG_r( what = "floating" );
920 goto hop_and_restart;
921 }
b7953727
JH
922 if (t != s) {
923 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 924 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
925 (long)(t - i_strpos), (long)(s - i_strpos))
926 );
927 }
928 else {
929 DEBUG_r(PerlIO_printf(Perl_debug_log,
930 "Does not contradict STCLASS...\n");
931 );
932 }
6eb5f6b9 933 }
5e39e1e5
HS
934 giveup:
935 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
936 PL_colors[4], (check ? "Guessed" : "Giving up"),
937 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 938 return s;
2c2d71f5
JH
939
940 fail_finish: /* Substring not found */
33b8afdf
JH
941 if (prog->check_substr || prog->check_utf8) /* could be removed already */
942 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 943 fail:
2c2d71f5 944 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
945 PL_colors[4],PL_colors[5]));
946 return Nullch;
947}
9661b544 948
6eb5f6b9 949/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
950STATIC char *
951S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 952{
6eb5f6b9
JH
953 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
954 char *m;
d8093b23 955 STRLEN ln;
5dab1207 956 STRLEN lnc;
078c425b 957 register STRLEN uskip;
d8093b23
G
958 unsigned int c1;
959 unsigned int c2;
6eb5f6b9
JH
960 char *e;
961 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 962 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 963
6eb5f6b9
JH
964 /* We know what class it must start with. */
965 switch (OP(c)) {
6eb5f6b9 966 case ANYOF:
388cc4de 967 if (do_utf8) {
078c425b 968 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
969 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
970 !UTF8_IS_INVARIANT((U8)s[0]) ?
971 reginclass(c, (U8*)s, 0, do_utf8) :
972 REGINCLASS(c, (U8*)s)) {
973 if (tmp && (norun || regtry(prog, s)))
974 goto got_it;
975 else
976 tmp = doevery;
977 }
978 else
979 tmp = 1;
078c425b 980 s += uskip;
388cc4de
HS
981 }
982 }
983 else {
984 while (s < strend) {
985 STRLEN skip = 1;
986
987 if (REGINCLASS(c, (U8*)s) ||
988 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
989 /* The assignment of 2 is intentional:
990 * for the folded sharp s, the skip is 2. */
991 (skip = SHARP_S_SKIP))) {
992 if (tmp && (norun || regtry(prog, s)))
993 goto got_it;
994 else
995 tmp = doevery;
996 }
997 else
998 tmp = 1;
999 s += skip;
1000 }
a0d0e21e 1001 }
6eb5f6b9 1002 break;
f33976b4
DB
1003 case CANY:
1004 while (s < strend) {
1005 if (tmp && (norun || regtry(prog, s)))
1006 goto got_it;
1007 else
1008 tmp = doevery;
1009 s++;
1010 }
1011 break;
6eb5f6b9 1012 case EXACTF:
5dab1207
NIS
1013 m = STRING(c);
1014 ln = STR_LEN(c); /* length to match in octets/bytes */
1015 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1016 if (UTF) {
a2a2844f 1017 STRLEN ulen1, ulen2;
5dab1207 1018 U8 *sm = (U8 *) m;
e7ae6809
JH
1019 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1020 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
1021
1022 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1023 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1024
872c91ae
JH
1025 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1026 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1027 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1028 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
5dab1207
NIS
1029 lnc = 0;
1030 while (sm < ((U8 *) m + ln)) {
1031 lnc++;
1032 sm += UTF8SKIP(sm);
1033 }
1aa99e6b
IH
1034 }
1035 else {
1036 c1 = *(U8*)m;
1037 c2 = PL_fold[c1];
1038 }
6eb5f6b9
JH
1039 goto do_exactf;
1040 case EXACTFL:
5dab1207
NIS
1041 m = STRING(c);
1042 ln = STR_LEN(c);
1043 lnc = (I32) ln;
d8093b23 1044 c1 = *(U8*)m;
6eb5f6b9
JH
1045 c2 = PL_fold_locale[c1];
1046 do_exactf:
db12adc6 1047 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1048
6eb5f6b9
JH
1049 if (norun && e < s)
1050 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1051
60a8b682
JH
1052 /* The idea in the EXACTF* cases is to first find the
1053 * first character of the EXACTF* node and then, if
1054 * necessary, case-insensitively compare the full
1055 * text of the node. The c1 and c2 are the first
1056 * characters (though in Unicode it gets a bit
1057 * more complicated because there are more cases
7f16dd3d
JH
1058 * than just upper and lower: one needs to use
1059 * the so-called folding case for case-insensitive
1060 * matching (called "loose matching" in Unicode).
1061 * ibcmp_utf8() will do just that. */
60a8b682 1062
1aa99e6b 1063 if (do_utf8) {
575cac57
JH
1064 UV c, f;
1065 U8 tmpbuf [UTF8_MAXLEN+1];
1066 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1067 STRLEN len, foldlen;
d7f013c8 1068
09091399 1069 if (c1 == c2) {
5dab1207
NIS
1070 /* Upper and lower of 1st char are equal -
1071 * probably not a "letter". */
1aa99e6b 1072 while (s <= e) {
872c91ae
JH
1073 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1074 ckWARN(WARN_UTF8) ?
1075 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1076 if ( c == c1
1077 && (ln == len ||
66423254 1078 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1079 m, (char **)0, ln, (bool)UTF))
55da9344 1080 && (norun || regtry(prog, s)) )
1aa99e6b 1081 goto got_it;
80aecb99
JH
1082 else {
1083 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1084 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1085 if ( f != c
1086 && (f == c1 || f == c2)
1087 && (ln == foldlen ||
66423254
JH
1088 !ibcmp_utf8((char *) foldbuf,
1089 (char **)0, foldlen, do_utf8,
d07ddd77 1090 m,
eb160463 1091 (char **)0, ln, (bool)UTF))
80aecb99
JH
1092 && (norun || regtry(prog, s)) )
1093 goto got_it;
1094 }
1aa99e6b
IH
1095 s += len;
1096 }
09091399
JH
1097 }
1098 else {
1aa99e6b 1099 while (s <= e) {
872c91ae
JH
1100 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1101 ckWARN(WARN_UTF8) ?
1102 0 : UTF8_ALLOW_ANY);
80aecb99 1103
60a8b682 1104 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1105 * Note that not all the possible combinations
1106 * are handled here: some of them are handled
1107 * by the standard folding rules, and some of
1108 * them (the character class or ANYOF cases)
1109 * are handled during compiletime in
1110 * regexec.c:S_regclass(). */
880bd946
JH
1111 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1112 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1113 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1114
1115 if ( (c == c1 || c == c2)
1116 && (ln == len ||
66423254 1117 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1118 m, (char **)0, ln, (bool)UTF))
55da9344 1119 && (norun || regtry(prog, s)) )
1aa99e6b 1120 goto got_it;
80aecb99
JH
1121 else {
1122 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1123 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1124 if ( f != c
1125 && (f == c1 || f == c2)
1126 && (ln == foldlen ||
a6872d42 1127 !ibcmp_utf8((char *) foldbuf,
66423254 1128 (char **)0, foldlen, do_utf8,
d07ddd77 1129 m,
eb160463 1130 (char **)0, ln, (bool)UTF))
80aecb99
JH
1131 && (norun || regtry(prog, s)) )
1132 goto got_it;
1133 }
1aa99e6b
IH
1134 s += len;
1135 }
09091399 1136 }
1aa99e6b
IH
1137 }
1138 else {
1139 if (c1 == c2)
1140 while (s <= e) {
1141 if ( *(U8*)s == c1
1142 && (ln == 1 || !(OP(c) == EXACTF
1143 ? ibcmp(s, m, ln)
1144 : ibcmp_locale(s, m, ln)))
1145 && (norun || regtry(prog, s)) )
1146 goto got_it;
1147 s++;
1148 }
1149 else
1150 while (s <= e) {
1151 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1152 && (ln == 1 || !(OP(c) == EXACTF
1153 ? ibcmp(s, m, ln)
1154 : ibcmp_locale(s, m, ln)))
1155 && (norun || regtry(prog, s)) )
1156 goto got_it;
1157 s++;
1158 }
b3c9acc1
IZ
1159 }
1160 break;
bbce6d69 1161 case BOUNDL:
3280af22 1162 PL_reg_flags |= RF_tainted;
bbce6d69 1163 /* FALL THROUGH */
a0d0e21e 1164 case BOUND:
ffc61ed2 1165 if (do_utf8) {
12d33761 1166 if (s == PL_bostr)
ffc61ed2
JH
1167 tmp = '\n';
1168 else {
b4f7163a 1169 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1170
b4f7163a 1171 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1172 }
1173 tmp = ((OP(c) == BOUND ?
9041c2e3 1174 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1175 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1176 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1177 if (tmp == !(OP(c) == BOUND ?
3568d838 1178 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1179 isALNUM_LC_utf8((U8*)s)))
1180 {
1181 tmp = !tmp;
1182 if ((norun || regtry(prog, s)))
1183 goto got_it;
1184 }
078c425b 1185 s += uskip;
a687059c 1186 }
a0d0e21e 1187 }
667bb95a 1188 else {
12d33761 1189 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1190 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1191 while (s < strend) {
1192 if (tmp ==
1193 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1194 tmp = !tmp;
1195 if ((norun || regtry(prog, s)))
1196 goto got_it;
1197 }
1198 s++;
a0ed51b3 1199 }
a0ed51b3 1200 }
6eb5f6b9 1201 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1202 goto got_it;
1203 break;
bbce6d69 1204 case NBOUNDL:
3280af22 1205 PL_reg_flags |= RF_tainted;
bbce6d69 1206 /* FALL THROUGH */
a0d0e21e 1207 case NBOUND:
ffc61ed2 1208 if (do_utf8) {
12d33761 1209 if (s == PL_bostr)
ffc61ed2
JH
1210 tmp = '\n';
1211 else {
b4f7163a 1212 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1213
b4f7163a 1214 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1215 }
1216 tmp = ((OP(c) == NBOUND ?
9041c2e3 1217 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1218 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1219 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1220 if (tmp == !(OP(c) == NBOUND ?
3568d838 1221 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1222 isALNUM_LC_utf8((U8*)s)))
1223 tmp = !tmp;
1224 else if ((norun || regtry(prog, s)))
1225 goto got_it;
078c425b 1226 s += uskip;
ffc61ed2 1227 }
a0d0e21e 1228 }
667bb95a 1229 else {
12d33761 1230 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1231 tmp = ((OP(c) == NBOUND ?
1232 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1233 while (s < strend) {
1234 if (tmp ==
1235 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1236 tmp = !tmp;
1237 else if ((norun || regtry(prog, s)))
1238 goto got_it;
1239 s++;
1240 }
a0ed51b3 1241 }
6eb5f6b9 1242 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1243 goto got_it;
1244 break;
a0d0e21e 1245 case ALNUM:
ffc61ed2 1246 if (do_utf8) {
8269fa76 1247 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1248 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1249 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1250 if (tmp && (norun || regtry(prog, s)))
1251 goto got_it;
1252 else
1253 tmp = doevery;
1254 }
bbce6d69 1255 else
ffc61ed2 1256 tmp = 1;
078c425b 1257 s += uskip;
bbce6d69 1258 }
bbce6d69 1259 }
ffc61ed2
JH
1260 else {
1261 while (s < strend) {
1262 if (isALNUM(*s)) {
1263 if (tmp && (norun || regtry(prog, s)))
1264 goto got_it;
1265 else
1266 tmp = doevery;
1267 }
a0ed51b3 1268 else
ffc61ed2
JH
1269 tmp = 1;
1270 s++;
a0ed51b3 1271 }
a0ed51b3
LW
1272 }
1273 break;
bbce6d69 1274 case ALNUML:
3280af22 1275 PL_reg_flags |= RF_tainted;
ffc61ed2 1276 if (do_utf8) {
078c425b 1277 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1278 if (isALNUM_LC_utf8((U8*)s)) {
1279 if (tmp && (norun || regtry(prog, s)))
1280 goto got_it;
1281 else
1282 tmp = doevery;
1283 }
a687059c 1284 else
ffc61ed2 1285 tmp = 1;
078c425b 1286 s += uskip;
a0d0e21e 1287 }
a0d0e21e 1288 }
ffc61ed2
JH
1289 else {
1290 while (s < strend) {
1291 if (isALNUM_LC(*s)) {
1292 if (tmp && (norun || regtry(prog, s)))
1293 goto got_it;
1294 else
1295 tmp = doevery;
1296 }
a0ed51b3 1297 else
ffc61ed2
JH
1298 tmp = 1;
1299 s++;
a0ed51b3 1300 }
a0ed51b3
LW
1301 }
1302 break;
a0d0e21e 1303 case NALNUM:
ffc61ed2 1304 if (do_utf8) {
8269fa76 1305 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1306 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1307 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1308 if (tmp && (norun || regtry(prog, s)))
1309 goto got_it;
1310 else
1311 tmp = doevery;
1312 }
bbce6d69 1313 else
ffc61ed2 1314 tmp = 1;
078c425b 1315 s += uskip;
bbce6d69 1316 }
bbce6d69 1317 }
ffc61ed2
JH
1318 else {
1319 while (s < strend) {
1320 if (!isALNUM(*s)) {
1321 if (tmp && (norun || regtry(prog, s)))
1322 goto got_it;
1323 else
1324 tmp = doevery;
1325 }
a0ed51b3 1326 else
ffc61ed2
JH
1327 tmp = 1;
1328 s++;
a0ed51b3 1329 }
a0ed51b3
LW
1330 }
1331 break;
bbce6d69 1332 case NALNUML:
3280af22 1333 PL_reg_flags |= RF_tainted;
ffc61ed2 1334 if (do_utf8) {
078c425b 1335 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1336 if (!isALNUM_LC_utf8((U8*)s)) {
1337 if (tmp && (norun || regtry(prog, s)))
1338 goto got_it;
1339 else
1340 tmp = doevery;
1341 }
a687059c 1342 else
ffc61ed2 1343 tmp = 1;
078c425b 1344 s += uskip;
a687059c 1345 }
a0d0e21e 1346 }
ffc61ed2
JH
1347 else {
1348 while (s < strend) {
1349 if (!isALNUM_LC(*s)) {
1350 if (tmp && (norun || regtry(prog, s)))
1351 goto got_it;
1352 else
1353 tmp = doevery;
1354 }
a0ed51b3 1355 else
ffc61ed2
JH
1356 tmp = 1;
1357 s++;
a0ed51b3 1358 }
a0ed51b3
LW
1359 }
1360 break;
a0d0e21e 1361 case SPACE:
ffc61ed2 1362 if (do_utf8) {
8269fa76 1363 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1364 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1365 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1366 if (tmp && (norun || regtry(prog, s)))
1367 goto got_it;
1368 else
1369 tmp = doevery;
1370 }
a0d0e21e 1371 else
ffc61ed2 1372 tmp = 1;
078c425b 1373 s += uskip;
2304df62 1374 }
a0d0e21e 1375 }
ffc61ed2
JH
1376 else {
1377 while (s < strend) {
1378 if (isSPACE(*s)) {
1379 if (tmp && (norun || regtry(prog, s)))
1380 goto got_it;
1381 else
1382 tmp = doevery;
1383 }
a0ed51b3 1384 else
ffc61ed2
JH
1385 tmp = 1;
1386 s++;
a0ed51b3 1387 }
a0ed51b3
LW
1388 }
1389 break;
bbce6d69 1390 case SPACEL:
3280af22 1391 PL_reg_flags |= RF_tainted;
ffc61ed2 1392 if (do_utf8) {
078c425b 1393 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1394 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1395 if (tmp && (norun || regtry(prog, s)))
1396 goto got_it;
1397 else
1398 tmp = doevery;
1399 }
bbce6d69 1400 else
ffc61ed2 1401 tmp = 1;
078c425b 1402 s += uskip;
bbce6d69 1403 }
bbce6d69 1404 }
ffc61ed2
JH
1405 else {
1406 while (s < strend) {
1407 if (isSPACE_LC(*s)) {
1408 if (tmp && (norun || regtry(prog, s)))
1409 goto got_it;
1410 else
1411 tmp = doevery;
1412 }
a0ed51b3 1413 else
ffc61ed2
JH
1414 tmp = 1;
1415 s++;
a0ed51b3 1416 }
a0ed51b3
LW
1417 }
1418 break;
a0d0e21e 1419 case NSPACE:
ffc61ed2 1420 if (do_utf8) {
8269fa76 1421 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1422 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1423 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1424 if (tmp && (norun || regtry(prog, s)))
1425 goto got_it;
1426 else
1427 tmp = doevery;
1428 }
a0d0e21e 1429 else
ffc61ed2 1430 tmp = 1;
078c425b 1431 s += uskip;
a687059c 1432 }
a0d0e21e 1433 }
ffc61ed2
JH
1434 else {
1435 while (s < strend) {
1436 if (!isSPACE(*s)) {
1437 if (tmp && (norun || regtry(prog, s)))
1438 goto got_it;
1439 else
1440 tmp = doevery;
1441 }
a0ed51b3 1442 else
ffc61ed2
JH
1443 tmp = 1;
1444 s++;
a0ed51b3 1445 }
a0ed51b3
LW
1446 }
1447 break;
bbce6d69 1448 case NSPACEL:
3280af22 1449 PL_reg_flags |= RF_tainted;
ffc61ed2 1450 if (do_utf8) {
078c425b 1451 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1452 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1453 if (tmp && (norun || regtry(prog, s)))
1454 goto got_it;
1455 else
1456 tmp = doevery;
1457 }
bbce6d69 1458 else
ffc61ed2 1459 tmp = 1;
078c425b 1460 s += uskip;
bbce6d69 1461 }
bbce6d69 1462 }
ffc61ed2
JH
1463 else {
1464 while (s < strend) {
1465 if (!isSPACE_LC(*s)) {
1466 if (tmp && (norun || regtry(prog, s)))
1467 goto got_it;
1468 else
1469 tmp = doevery;
1470 }
a0ed51b3 1471 else
ffc61ed2
JH
1472 tmp = 1;
1473 s++;
a0ed51b3 1474 }
a0ed51b3
LW
1475 }
1476 break;
a0d0e21e 1477 case DIGIT:
ffc61ed2 1478 if (do_utf8) {
8269fa76 1479 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1480 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1481 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1482 if (tmp && (norun || regtry(prog, s)))
1483 goto got_it;
1484 else
1485 tmp = doevery;
1486 }
a0d0e21e 1487 else
ffc61ed2 1488 tmp = 1;
078c425b 1489 s += uskip;
2b69d0c2 1490 }
a0d0e21e 1491 }
ffc61ed2
JH
1492 else {
1493 while (s < strend) {
1494 if (isDIGIT(*s)) {
1495 if (tmp && (norun || regtry(prog, s)))
1496 goto got_it;
1497 else
1498 tmp = doevery;
1499 }
a0ed51b3 1500 else
ffc61ed2
JH
1501 tmp = 1;
1502 s++;
a0ed51b3 1503 }
a0ed51b3
LW
1504 }
1505 break;
b8c5462f
JH
1506 case DIGITL:
1507 PL_reg_flags |= RF_tainted;
ffc61ed2 1508 if (do_utf8) {
078c425b 1509 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1510 if (isDIGIT_LC_utf8((U8*)s)) {
1511 if (tmp && (norun || regtry(prog, s)))
1512 goto got_it;
1513 else
1514 tmp = doevery;
1515 }
b8c5462f 1516 else
ffc61ed2 1517 tmp = 1;
078c425b 1518 s += uskip;
b8c5462f 1519 }
b8c5462f 1520 }
ffc61ed2
JH
1521 else {
1522 while (s < strend) {
1523 if (isDIGIT_LC(*s)) {
1524 if (tmp && (norun || regtry(prog, s)))
1525 goto got_it;
1526 else
1527 tmp = doevery;
1528 }
b8c5462f 1529 else
ffc61ed2
JH
1530 tmp = 1;
1531 s++;
b8c5462f 1532 }
b8c5462f
JH
1533 }
1534 break;
a0d0e21e 1535 case NDIGIT:
ffc61ed2 1536 if (do_utf8) {
8269fa76 1537 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1538 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1539 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1540 if (tmp && (norun || regtry(prog, s)))
1541 goto got_it;
1542 else
1543 tmp = doevery;
1544 }
a0d0e21e 1545 else
ffc61ed2 1546 tmp = 1;
078c425b 1547 s += uskip;
a687059c 1548 }
a0d0e21e 1549 }
ffc61ed2
JH
1550 else {
1551 while (s < strend) {
1552 if (!isDIGIT(*s)) {
1553 if (tmp && (norun || regtry(prog, s)))
1554 goto got_it;
1555 else
1556 tmp = doevery;
1557 }
a0ed51b3 1558 else
ffc61ed2
JH
1559 tmp = 1;
1560 s++;
a0ed51b3 1561 }
a0ed51b3
LW
1562 }
1563 break;
b8c5462f
JH
1564 case NDIGITL:
1565 PL_reg_flags |= RF_tainted;
ffc61ed2 1566 if (do_utf8) {
078c425b 1567 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1568 if (!isDIGIT_LC_utf8((U8*)s)) {
1569 if (tmp && (norun || regtry(prog, s)))
1570 goto got_it;
1571 else
1572 tmp = doevery;
1573 }
b8c5462f 1574 else
ffc61ed2 1575 tmp = 1;
078c425b 1576 s += uskip;
b8c5462f 1577 }
a0ed51b3 1578 }
ffc61ed2
JH
1579 else {
1580 while (s < strend) {
1581 if (!isDIGIT_LC(*s)) {
1582 if (tmp && (norun || regtry(prog, s)))
1583 goto got_it;
1584 else
1585 tmp = doevery;
1586 }
cf93c79d 1587 else
ffc61ed2
JH
1588 tmp = 1;
1589 s++;
b8c5462f 1590 }
b8c5462f
JH
1591 }
1592 break;
b3c9acc1 1593 default:
3c3eec57
GS
1594 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1595 break;
d6a28714 1596 }
6eb5f6b9
JH
1597 return 0;
1598 got_it:
1599 return s;
1600}
1601
1602/*
1603 - regexec_flags - match a regexp against a string
1604 */
1605I32
1606Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1607 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1608/* strend: pointer to null at end of string */
1609/* strbeg: real beginning of string */
1610/* minend: end of match must be >=minend after stringarg. */
1611/* data: May be used for some additional optimizations. */
1612/* nosave: For optimizations. */
1613{
6eb5f6b9
JH
1614 register char *s;
1615 register regnode *c;
1616 register char *startpos = stringarg;
6eb5f6b9
JH
1617 I32 minlen; /* must match at least this many chars */
1618 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1619 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1620 constant substr. */ /* CC */
1621 I32 end_shift = 0; /* Same for the end. */ /* CC */
1622 I32 scream_pos = -1; /* Internal iterator of scream. */
1623 char *scream_olds;
1624 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1625 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1626#ifdef DEBUGGING
9e55ce06
JH
1627 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1628 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1629#endif
a30b2f1f 1630 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1631
1632 PL_regcc = 0;
1633
1634 cache_re(prog);
1635#ifdef DEBUGGING
aea4f609 1636 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1637#endif
1638
1639 /* Be paranoid... */
1640 if (prog == NULL || startpos == NULL) {
1641 Perl_croak(aTHX_ "NULL regexp parameter");
1642 return 0;
1643 }
1644
1645 minlen = prog->minlen;
61a36c01 1646 if (strend - startpos < minlen) {
a72c7584
JH
1647 DEBUG_r(PerlIO_printf(Perl_debug_log,
1648 "String too short [regexec_flags]...\n"));
1649 goto phooey;
1aa99e6b 1650 }
6eb5f6b9 1651
6eb5f6b9
JH
1652 /* Check validity of program. */
1653 if (UCHARAT(prog->program) != REG_MAGIC) {
1654 Perl_croak(aTHX_ "corrupted regexp program");
1655 }
1656
1657 PL_reg_flags = 0;
1658 PL_reg_eval_set = 0;
1659 PL_reg_maxiter = 0;
1660
1661 if (prog->reganch & ROPT_UTF8)
1662 PL_reg_flags |= RF_utf8;
1663
1664 /* Mark beginning of line for ^ and lookbehind. */
1665 PL_regbol = startpos;
1666 PL_bostr = strbeg;
1667 PL_reg_sv = sv;
1668
1669 /* Mark end of line for $ (and such) */
1670 PL_regeol = strend;
1671
1672 /* see how far we have to get to not match where we matched before */
1673 PL_regtill = startpos+minend;
1674
1675 /* We start without call_cc context. */
1676 PL_reg_call_cc = 0;
1677
1678 /* If there is a "must appear" string, look for it. */
1679 s = startpos;
1680
1681 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1682 MAGIC *mg;
1683
1684 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1685 PL_reg_ganch = startpos;
1686 else if (sv && SvTYPE(sv) >= SVt_PVMG
1687 && SvMAGIC(sv)
14befaf4
DM
1688 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1689 && mg->mg_len >= 0) {
6eb5f6b9
JH
1690 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1691 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1692 if (s > PL_reg_ganch)
6eb5f6b9
JH
1693 goto phooey;
1694 s = PL_reg_ganch;
1695 }
1696 }
1697 else /* pos() not defined */
1698 PL_reg_ganch = strbeg;
1699 }
1700
33b8afdf 1701 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1702 re_scream_pos_data d;
1703
1704 d.scream_olds = &scream_olds;
1705 d.scream_pos = &scream_pos;
1706 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1707 if (!s) {
1708 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1709 goto phooey; /* not present */
3fa9c3d7 1710 }
6eb5f6b9
JH
1711 }
1712
2a782b5b 1713 DEBUG_r({
9e55ce06
JH
1714 char *s0 = UTF ?
1715 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1716 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1717 prog->precomp;
1718 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1719 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1720 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1721 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1722 if (!PL_colorset)
1723 reginitcolors();
1724 PerlIO_printf(Perl_debug_log,
9e55ce06 1725 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1726 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1727 len0, len0, s0,
2a782b5b 1728 PL_colors[1],
9e55ce06 1729 len0 > 60 ? "..." : "",
2a782b5b 1730 PL_colors[0],
9e55ce06
JH
1731 (int)(len1 > 60 ? 60 : len1),
1732 s1, PL_colors[1],
1733 (len1 > 60 ? "..." : "")
2a782b5b
JH
1734 );
1735 });
6eb5f6b9
JH
1736
1737 /* Simplest case: anchored match need be tried only once. */
1738 /* [unless only anchor is BOL and multiline is set] */
1739 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1740 if (s == startpos && regtry(prog, startpos))
1741 goto got_it;
1742 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1743 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1744 {
1745 char *end;
1746
1747 if (minlen)
1748 dontbother = minlen - 1;
1aa99e6b 1749 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1750 /* for multiline we only have to try after newlines */
33b8afdf 1751 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1752 if (s == startpos)
1753 goto after_try;
1754 while (1) {
1755 if (regtry(prog, s))
1756 goto got_it;
1757 after_try:
1758 if (s >= end)
1759 goto phooey;
1760 if (prog->reganch & RE_USE_INTUIT) {
1761 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1762 if (!s)
1763 goto phooey;
1764 }
1765 else
1766 s++;
1767 }
1768 } else {
1769 if (s > startpos)
1770 s--;
1771 while (s < end) {
1772 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1773 if (regtry(prog, s))
1774 goto got_it;
1775 }
1776 }
1777 }
1778 }
1779 goto phooey;
1780 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1781 if (regtry(prog, PL_reg_ganch))
1782 goto got_it;
1783 goto phooey;
1784 }
1785
1786 /* Messy cases: unanchored match. */
33b8afdf 1787 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1788 /* we have /x+whatever/ */
1789 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1790 char ch;
bf93d4cc
GS
1791#ifdef DEBUGGING
1792 int did_match = 0;
1793#endif
33b8afdf
JH
1794 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1795 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1796 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1797
1aa99e6b 1798 if (do_utf8) {
6eb5f6b9
JH
1799 while (s < strend) {
1800 if (*s == ch) {
bf93d4cc 1801 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1802 if (regtry(prog, s)) goto got_it;
1803 s += UTF8SKIP(s);
1804 while (s < strend && *s == ch)
1805 s += UTF8SKIP(s);
1806 }
1807 s += UTF8SKIP(s);
1808 }
1809 }
1810 else {
1811 while (s < strend) {
1812 if (*s == ch) {
bf93d4cc 1813 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1814 if (regtry(prog, s)) goto got_it;
1815 s++;
1816 while (s < strend && *s == ch)
1817 s++;
1818 }
1819 s++;
1820 }
1821 }
b7953727 1822 DEBUG_r(if (!did_match)
bf93d4cc 1823 PerlIO_printf(Perl_debug_log,
b7953727
JH
1824 "Did not find anchored character...\n")
1825 );
6eb5f6b9
JH
1826 }
1827 /*SUPPRESS 560*/
33b8afdf
JH
1828 else if (prog->anchored_substr != Nullsv
1829 || prog->anchored_utf8 != Nullsv
1830 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1831 && prog->float_max_offset < strend - s)) {
1832 SV *must;
1833 I32 back_max;
1834 I32 back_min;
1835 char *last;
6eb5f6b9 1836 char *last1; /* Last position checked before */
bf93d4cc
GS
1837#ifdef DEBUGGING
1838 int did_match = 0;
1839#endif
33b8afdf
JH
1840 if (prog->anchored_substr || prog->anchored_utf8) {
1841 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1842 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1843 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1844 back_max = back_min = prog->anchored_offset;
1845 } else {
1846 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1847 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1848 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1849 back_max = prog->float_max_offset;
1850 back_min = prog->float_min_offset;
1851 }
1852 if (must == &PL_sv_undef)
1853 /* could not downgrade utf8 check substring, so must fail */
1854 goto phooey;
1855
1856 last = HOP3c(strend, /* Cannot start after this */
1857 -(I32)(CHR_SVLEN(must)
1858 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1859
1860 if (s > PL_bostr)
1861 last1 = HOPc(s, -1);
1862 else
1863 last1 = s - 1; /* bogus */
1864
1865 /* XXXX check_substr already used to find `s', can optimize if
1866 check_substr==must. */
1867 scream_pos = -1;
1868 dontbother = end_shift;
1869 strend = HOPc(strend, -dontbother);
1870 while ( (s <= last) &&
9041c2e3 1871 ((flags & REXEC_SCREAM)
1aa99e6b 1872 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1873 end_shift, &scream_pos, 0))
1aa99e6b 1874 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1875 (unsigned char*)strend, must,
6eb5f6b9 1876 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1877 /* we may be pointing at the wrong string */
1878 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
7ef91622 1879 s = strbeg + (s - SvPVX(sv));
bf93d4cc 1880 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1881 if (HOPc(s, -back_max) > last1) {
1882 last1 = HOPc(s, -back_min);
1883 s = HOPc(s, -back_max);
1884 }
1885 else {
1886 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1887
1888 last1 = HOPc(s, -back_min);
1889 s = t;
1890 }
1aa99e6b 1891 if (do_utf8) {
6eb5f6b9
JH
1892 while (s <= last1) {
1893 if (regtry(prog, s))
1894 goto got_it;
1895 s += UTF8SKIP(s);
1896 }
1897 }
1898 else {
1899 while (s <= last1) {
1900 if (regtry(prog, s))
1901 goto got_it;
1902 s++;
1903 }
1904 }
1905 }
b7953727
JH
1906 DEBUG_r(if (!did_match)
1907 PerlIO_printf(Perl_debug_log,
1908 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1909 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1910 ? "anchored" : "floating"),
1911 PL_colors[0],
1912 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1913 SvPVX(must),
b7953727
JH
1914 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1915 );
6eb5f6b9
JH
1916 goto phooey;
1917 }
155aba94 1918 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1919 if (minlen) {
1920 I32 op = (U8)OP(prog->regstclass);
66e933ab 1921 /* don't bother with what can't match */
f14c76ed
RGS
1922 if (PL_regkind[op] != EXACT && op != CANY)
1923 strend = HOPc(strend, -(minlen - 1));
1924 }
ffc61ed2
JH
1925 DEBUG_r({
1926 SV *prop = sv_newmortal();
9e55ce06
JH
1927 char *s0;
1928 char *s1;
1929 int len0;
1930 int len1;
1931
ffc61ed2 1932 regprop(prop, c);
9e55ce06
JH
1933 s0 = UTF ?
1934 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1935 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1936 SvPVX(prop);
1937 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1938 s1 = UTF ?
c728cb41 1939 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1940 len1 = UTF ? SvCUR(dsv1) : strend - s;
1941 PerlIO_printf(Perl_debug_log,
1942 "Matching stclass `%*.*s' against `%*.*s'\n",
1943 len0, len0, s0,
1944 len1, len1, s1);
ffc61ed2 1945 });
6eb5f6b9
JH
1946 if (find_byclass(prog, c, s, strend, startpos, 0))
1947 goto got_it;
bf93d4cc 1948 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1949 }
1950 else {
1951 dontbother = 0;
33b8afdf
JH
1952 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1953 /* Trim the end. */
d6a28714 1954 char *last;
33b8afdf
JH
1955 SV* float_real;
1956
1957 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1958 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1959 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1960
1961 if (flags & REXEC_SCREAM) {
33b8afdf 1962 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1963 end_shift, &scream_pos, 1); /* last one */
1964 if (!last)
ffc61ed2 1965 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1966 /* we may be pointing at the wrong string */
1967 else if (RX_MATCH_COPIED(prog))
7ef91622 1968 s = strbeg + (s - SvPVX(sv));
b8c5462f 1969 }
d6a28714
JH
1970 else {
1971 STRLEN len;
33b8afdf 1972 char *little = SvPV(float_real, len);
d6a28714 1973
33b8afdf 1974 if (SvTAIL(float_real)) {
d6a28714
JH
1975 if (memEQ(strend - len + 1, little, len - 1))
1976 last = strend - len + 1;
1977 else if (!PL_multiline)
9041c2e3 1978 last = memEQ(strend - len, little, len)
d6a28714 1979 ? strend - len : Nullch;
b8c5462f 1980 else
d6a28714
JH
1981 goto find_last;
1982 } else {
1983 find_last:
9041c2e3 1984 if (len)
d6a28714 1985 last = rninstr(s, strend, little, little + len);
b8c5462f 1986 else
d6a28714 1987 last = strend; /* matching `$' */
b8c5462f 1988 }
b8c5462f 1989 }
bf93d4cc
GS
1990 if (last == NULL) {
1991 DEBUG_r(PerlIO_printf(Perl_debug_log,
1992 "%sCan't trim the tail, match fails (should not happen)%s\n",
1993 PL_colors[4],PL_colors[5]));
1994 goto phooey; /* Should not happen! */
1995 }
d6a28714
JH
1996 dontbother = strend - last + prog->float_min_offset;
1997 }
1998 if (minlen && (dontbother < minlen))
1999 dontbother = minlen - 1;
2000 strend -= dontbother; /* this one's always in bytes! */
2001 /* We don't know much -- general case. */
1aa99e6b 2002 if (do_utf8) {
d6a28714
JH
2003 for (;;) {
2004 if (regtry(prog, s))
2005 goto got_it;
2006 if (s >= strend)
2007 break;
b8c5462f 2008 s += UTF8SKIP(s);
d6a28714
JH
2009 };
2010 }
2011 else {
2012 do {
2013 if (regtry(prog, s))
2014 goto got_it;
2015 } while (s++ < strend);
2016 }
2017 }
2018
2019 /* Failure. */
2020 goto phooey;
2021
2022got_it:
2023 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2024
2025 if (PL_reg_eval_set) {
2026 /* Preserve the current value of $^R */
2027 if (oreplsv != GvSV(PL_replgv))
2028 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2029 restored, the value remains
2030 the same. */
acfe0abc 2031 restore_pos(aTHX_ 0);
d6a28714
JH
2032 }
2033
2034 /* make sure $`, $&, $', and $digit will work later */
2035 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2036 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2037 if (flags & REXEC_COPY_STR) {
2038 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2039#ifdef PERL_COPY_ON_WRITE
2040 if ((SvIsCOW(sv)
2041 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2042 if (DEBUG_C_TEST) {
2043 PerlIO_printf(Perl_debug_log,
2044 "Copy on write: regexp capture, type %d\n",
2045 (int) SvTYPE(sv));
2046 }
2047 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2048 prog->subbeg = SvPVX(prog->saved_copy);
2049 assert (SvPOKp(prog->saved_copy));
2050 } else
2051#endif
2052 {
2053 RX_MATCH_COPIED_on(prog);
2054 s = savepvn(strbeg, i);
2055 prog->subbeg = s;
2056 }
d6a28714 2057 prog->sublen = i;
d6a28714
JH
2058 }
2059 else {
2060 prog->subbeg = strbeg;
2061 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2062 }
2063 }
9041c2e3 2064
d6a28714
JH
2065 return 1;
2066
2067phooey:
bf93d4cc
GS
2068 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2069 PL_colors[4],PL_colors[5]));
d6a28714 2070 if (PL_reg_eval_set)
acfe0abc 2071 restore_pos(aTHX_ 0);
d6a28714
JH
2072 return 0;
2073}
2074
2075/*
2076 - regtry - try match at specific point
2077 */
2078STATIC I32 /* 0 failure, 1 success */
2079S_regtry(pTHX_ regexp *prog, char *startpos)
2080{
d6a28714
JH
2081 register I32 i;
2082 register I32 *sp;
2083 register I32 *ep;
2084 CHECKPOINT lastcp;
2085
02db2b7b
IZ
2086#ifdef DEBUGGING
2087 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2088#endif
d6a28714
JH
2089 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2090 MAGIC *mg;
2091
2092 PL_reg_eval_set = RS_init;
2093 DEBUG_r(DEBUG_s(
b900a521
JH
2094 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2095 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2096 ));
e8347627 2097 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2098 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2099 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2100 SAVETMPS;
2101 /* Apparently this is not needed, judging by wantarray. */
e8347627 2102 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2103 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2104
2105 if (PL_reg_sv) {
2106 /* Make $_ available to executed code. */
2107 if (PL_reg_sv != DEFSV) {
59f00321 2108 SAVE_DEFSV;
d6a28714 2109 DEFSV = PL_reg_sv;
b8c5462f 2110 }
d6a28714 2111
9041c2e3 2112 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2113 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2114 /* prepare for quick setting of pos */
14befaf4
DM
2115 sv_magic(PL_reg_sv, (SV*)0,
2116 PERL_MAGIC_regex_global, Nullch, 0);
2117 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2118 mg->mg_len = -1;
b8c5462f 2119 }
d6a28714
JH
2120 PL_reg_magic = mg;
2121 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2122 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2123 }
09687e5a 2124 if (!PL_reg_curpm) {
0f79a09d 2125 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2126#ifdef USE_ITHREADS
2127 {
2128 SV* repointer = newSViv(0);
577e12cc 2129 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2130 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2131 av_push(PL_regex_padav,repointer);
2132 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2133 PL_regex_pad = AvARRAY(PL_regex_padav);
2134 }
2135#endif
2136 }
aaa362c4 2137 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2138 PL_reg_oldcurpm = PL_curpm;
2139 PL_curpm = PL_reg_curpm;
2140 if (RX_MATCH_COPIED(prog)) {
2141 /* Here is a serious problem: we cannot rewrite subbeg,
2142 since it may be needed if this match fails. Thus
2143 $` inside (?{}) could fail... */
2144 PL_reg_oldsaved = prog->subbeg;
2145 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2146#ifdef PERL_COPY_ON_WRITE
2147 PL_nrs = prog->saved_copy;
2148#endif
d6a28714
JH
2149 RX_MATCH_COPIED_off(prog);
2150 }
2151 else
2152 PL_reg_oldsaved = Nullch;
2153 prog->subbeg = PL_bostr;
2154 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2155 }
2156 prog->startp[0] = startpos - PL_bostr;
2157 PL_reginput = startpos;
2158 PL_regstartp = prog->startp;
2159 PL_regendp = prog->endp;
2160 PL_reglastparen = &prog->lastparen;
a01268b5 2161 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2162 prog->lastparen = 0;
03994de8 2163 prog->lastcloseparen = 0;
d6a28714
JH
2164 PL_regsize = 0;
2165 DEBUG_r(PL_reg_starttry = startpos);
2166 if (PL_reg_start_tmpl <= prog->nparens) {
2167 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2168 if(PL_reg_start_tmp)
2169 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2170 else
2171 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2172 }
2173
2174 /* XXXX What this code is doing here?!!! There should be no need
2175 to do this again and again, PL_reglastparen should take care of
3dd2943c 2176 this! --ilya*/
dafc8851
JH
2177
2178 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2179 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2180 * PL_reglastparen), is not needed at all by the test suite
2181 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2182 * enough, for building DynaLoader, or otherwise this
2183 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2184 * will happen. Meanwhile, this code *is* needed for the
2185 * above-mentioned test suite tests to succeed. The common theme
2186 * on those tests seems to be returning null fields from matches.
2187 * --jhi */
dafc8851 2188#if 1
d6a28714
JH
2189 sp = prog->startp;
2190 ep = prog->endp;
2191 if (prog->nparens) {
eb160463 2192 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2193 *++sp = -1;
2194 *++ep = -1;
2195 }
2196 }
dafc8851 2197#endif
02db2b7b 2198 REGCP_SET(lastcp);
d6a28714
JH
2199 if (regmatch(prog->program + 1)) {
2200 prog->endp[0] = PL_reginput - PL_bostr;
2201 return 1;
2202 }
02db2b7b 2203 REGCP_UNWIND(lastcp);
d6a28714
JH
2204 return 0;
2205}
2206
02db2b7b
IZ
2207#define RE_UNWIND_BRANCH 1
2208#define RE_UNWIND_BRANCHJ 2
2209
2210union re_unwind_t;
2211
2212typedef struct { /* XX: makes sense to enlarge it... */
2213 I32 type;
2214 I32 prev;
2215 CHECKPOINT lastcp;
2216} re_unwind_generic_t;
2217
2218typedef struct {
2219 I32 type;
2220 I32 prev;
2221 CHECKPOINT lastcp;
2222 I32 lastparen;
2223 regnode *next;
2224 char *locinput;
2225 I32 nextchr;
2226#ifdef DEBUGGING
2227 int regindent;
2228#endif
2229} re_unwind_branch_t;
2230
2231typedef union re_unwind_t {
2232 I32 type;
2233 re_unwind_generic_t generic;
2234 re_unwind_branch_t branch;
2235} re_unwind_t;
2236
8ba1375e
MJD
2237#define sayYES goto yes
2238#define sayNO goto no
e0f9d4a8 2239#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2240#define sayYES_FINAL goto yes_final
2241#define sayYES_LOUD goto yes_loud
2242#define sayNO_FINAL goto no_final
2243#define sayNO_SILENT goto do_no
2244#define saySAME(x) if (x) goto yes; else goto no
2245
2246#define REPORT_CODE_OFF 24
2247
d6a28714
JH
2248/*
2249 - regmatch - main matching routine
2250 *
2251 * Conceptually the strategy is simple: check to see whether the current
2252 * node matches, call self recursively to see whether the rest matches,
2253 * and then act accordingly. In practice we make some effort to avoid
2254 * recursion, in particular by going through "ordinary" nodes (that don't
2255 * need to know whether the rest of the match failed) by a loop instead of
2256 * by recursion.
2257 */
2258/* [lwall] I've hoisted the register declarations to the outer block in order to
2259 * maybe save a little bit of pushing and popping on the stack. It also takes
2260 * advantage of machines that use a register save mask on subroutine entry.
2261 */
2262STATIC I32 /* 0 failure, 1 success */
2263S_regmatch(pTHX_ regnode *prog)
2264{
d6a28714
JH
2265 register regnode *scan; /* Current node. */
2266 regnode *next; /* Next node. */
2267 regnode *inner; /* Next node in internal branch. */
2268 register I32 nextchr; /* renamed nextchr - nextchar colides with
2269 function of same name */
2270 register I32 n; /* no or next */
b7953727
JH
2271 register I32 ln = 0; /* len or last */
2272 register char *s = Nullch; /* operand or save */
d6a28714 2273 register char *locinput = PL_reginput;
b7953727 2274 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2275 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2276 I32 unwind = 0;
b7953727 2277#if 0
02db2b7b 2278 I32 firstcp = PL_savestack_ix;
b7953727 2279#endif
53c4c00c 2280 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2281#ifdef DEBUGGING
ce333219
JH
2282 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2283 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2284 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2285#endif
02db2b7b 2286
d6a28714
JH
2287#ifdef DEBUGGING
2288 PL_regindent++;
2289#endif
2290
2291 /* Note that nextchr is a byte even in UTF */
2292 nextchr = UCHARAT(locinput);
2293 scan = prog;
2294 while (scan != NULL) {
8ba1375e 2295
2a782b5b 2296 DEBUG_r( {
d6a28714
JH
2297 SV *prop = sv_newmortal();
2298 int docolor = *PL_colors[0];
2299 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2300 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2301 /* The part of the string before starttry has one color
2302 (pref0_len chars), between starttry and current
2303 position another one (pref_len - pref0_len chars),
2304 after the current position the third one.
2305 We assume that pref0_len <= pref_len, otherwise we
2306 decrease pref0_len. */
9041c2e3 2307 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2308 ? (5 + taill) - l : locinput - PL_bostr;
2309 int pref0_len;
d6a28714 2310
df1ffd02 2311 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2312 pref_len++;
2313 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2314 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2315 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2316 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2317 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2318 l--;
d6a28714
JH
2319 if (pref0_len < 0)
2320 pref0_len = 0;
2321 if (pref0_len > pref_len)
2322 pref0_len = pref_len;
2323 regprop(prop, scan);
2a782b5b
JH
2324 {
2325 char *s0 =
f14c76ed 2326 do_utf8 && OP(scan) != CANY ?
2a782b5b 2327 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2328 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2329 locinput - pref_len;
df1ffd02 2330 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2331 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2332 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2333 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2334 locinput - pref_len + pref0_len;
df1ffd02 2335 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2336 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2337 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2338 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2339 locinput;
df1ffd02 2340 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2341 PerlIO_printf(Perl_debug_log,
2342 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2343 (IV)(locinput - PL_bostr),
2344 PL_colors[4],
2345 len0, s0,
2346 PL_colors[5],
2347 PL_colors[2],
2348 len1, s1,
2349 PL_colors[3],
2350 (docolor ? "" : "> <"),
2351 PL_colors[0],
2352 len2, s2,
2353 PL_colors[1],
2354 15 - l - pref_len + 1,
2355 "",
2356 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2357 SvPVX(prop));
2358 }
2359 });
d6a28714
JH
2360
2361 next = scan + NEXT_OFF(scan);
2362 if (next == scan)
2363 next = NULL;
2364
2365 switch (OP(scan)) {
2366 case BOL:
12d33761
HS
2367 if (locinput == PL_bostr || (PL_multiline &&
2368 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2369 {
2370 /* regtill = regbol; */
b8c5462f
JH
2371 break;
2372 }
d6a28714
JH
2373 sayNO;
2374 case MBOL:
12d33761
HS
2375 if (locinput == PL_bostr ||
2376 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2377 {
b8c5462f
JH
2378 break;
2379 }
d6a28714
JH
2380 sayNO;
2381 case SBOL:
c2a73568 2382 if (locinput == PL_bostr)
b8c5462f 2383 break;
d6a28714
JH
2384 sayNO;
2385 case GPOS:
2386 if (locinput == PL_reg_ganch)
2387 break;
2388 sayNO;
2389 case EOL:
2390 if (PL_multiline)
2391 goto meol;
2392 else
2393 goto seol;
2394 case MEOL:
2395 meol:
2396 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2397 sayNO;
b8c5462f 2398 break;
d6a28714
JH
2399 case SEOL:
2400 seol:
2401 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2402 sayNO;
d6a28714 2403 if (PL_regeol - locinput > 1)
b8c5462f 2404 sayNO;
b8c5462f 2405 break;
d6a28714
JH
2406 case EOS:
2407 if (PL_regeol != locinput)
b8c5462f 2408 sayNO;
d6a28714 2409 break;
ffc61ed2 2410 case SANY:
d6a28714 2411 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2412 sayNO;
f33976b4
DB
2413 if (do_utf8) {
2414 locinput += PL_utf8skip[nextchr];
2415 if (locinput > PL_regeol)
2416 sayNO;
2417 nextchr = UCHARAT(locinput);
2418 }
2419 else
2420 nextchr = UCHARAT(++locinput);
2421 break;
2422 case CANY:
2423 if (!nextchr && locinput >= PL_regeol)
2424 sayNO;
b8c5462f 2425 nextchr = UCHARAT(++locinput);
a0d0e21e 2426 break;
ffc61ed2 2427 case REG_ANY:
1aa99e6b
IH
2428 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2429 sayNO;
2430 if (do_utf8) {
b8c5462f 2431 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2432 if (locinput > PL_regeol)
2433 sayNO;
a0ed51b3 2434 nextchr = UCHARAT(locinput);
a0ed51b3 2435 }
1aa99e6b
IH
2436 else
2437 nextchr = UCHARAT(++locinput);
a0ed51b3 2438 break;
d6a28714 2439 case EXACT:
cd439c50
IZ
2440 s = STRING(scan);
2441 ln = STR_LEN(scan);
eb160463 2442 if (do_utf8 != UTF) {
bc517b45 2443 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2444 char *l = locinput;
2445 char *e = s + ln;
bc517b45 2446 STRLEN ulen;
a72c7584 2447
5ff6fc6d
JH
2448 if (do_utf8) {
2449 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2450 while (s < e) {
2451 if (l >= PL_regeol)
5ff6fc6d
JH
2452 sayNO;
2453 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2454 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2455 ckWARN(WARN_UTF8) ?
2456 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2457 sayNO;
bc517b45 2458 l += ulen;
5ff6fc6d 2459 s ++;
1aa99e6b 2460 }
5ff6fc6d
JH
2461 }
2462 else {
2463 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2464 while (s < e) {
2465 if (l >= PL_regeol)
2466 sayNO;
5ff6fc6d 2467 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2468 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2469 ckWARN(WARN_UTF8) ?
2470 0 : UTF8_ALLOW_ANY))
1aa99e6b 2471 sayNO;
bc517b45 2472 s += ulen;
a72c7584 2473 l ++;
1aa99e6b 2474 }
5ff6fc6d 2475 }
1aa99e6b
IH
2476 locinput = l;
2477 nextchr = UCHARAT(locinput);
2478 break;
2479 }
bc517b45 2480 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2481 /* Inline the first character, for speed. */
2482 if (UCHARAT(s) != nextchr)
2483 sayNO;
2484 if (PL_regeol - locinput < ln)
2485 sayNO;
2486 if (ln > 1 && memNE(s, locinput, ln))
2487 sayNO;
2488 locinput += ln;
2489 nextchr = UCHARAT(locinput);
2490 break;
2491 case EXACTFL:
b8c5462f
JH
2492 PL_reg_flags |= RF_tainted;
2493 /* FALL THROUGH */
d6a28714 2494 case EXACTF:
cd439c50
IZ
2495 s = STRING(scan);
2496 ln = STR_LEN(scan);
d6a28714 2497
d07ddd77
JH
2498 if (do_utf8 || UTF) {
2499 /* Either target or the pattern are utf8. */
d6a28714 2500 char *l = locinput;
d07ddd77 2501 char *e = PL_regeol;
bc517b45 2502
eb160463 2503 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2504 l, &e, 0, do_utf8)) {
5486206c
JH
2505 /* One more case for the sharp s:
2506 * pack("U0U*", 0xDF) =~ /ss/i,
2507 * the 0xC3 0x9F are the UTF-8
2508 * byte sequence for the U+00DF. */
2509 if (!(do_utf8 &&
2510 toLOWER(s[0]) == 's' &&
2511 ln >= 2 &&
2512 toLOWER(s[1]) == 's' &&
2513 (U8)l[0] == 0xC3 &&
2514 e - l >= 2 &&
2515 (U8)l[1] == 0x9F))
2516 sayNO;
2517 }
d07ddd77
JH
2518 locinput = e;
2519 nextchr = UCHARAT(locinput);
2520 break;
a0ed51b3 2521 }
d6a28714 2522
bc517b45
JH
2523 /* Neither the target and the pattern are utf8. */
2524
d6a28714
JH
2525 /* Inline the first character, for speed. */
2526 if (UCHARAT(s) != nextchr &&
2527 UCHARAT(s) != ((OP(scan) == EXACTF)
2528 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2529 sayNO;
d6a28714 2530 if (PL_regeol - locinput < ln)
b8c5462f 2531 sayNO;
d6a28714
JH
2532 if (ln > 1 && (OP(scan) == EXACTF
2533 ? ibcmp(s, locinput, ln)
2534 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2535 sayNO;
d6a28714
JH
2536 locinput += ln;
2537 nextchr = UCHARAT(locinput);
a0d0e21e 2538 break;
d6a28714 2539 case ANYOF:
ffc61ed2 2540 if (do_utf8) {
9e55ce06
JH
2541 STRLEN inclasslen = PL_regeol - locinput;
2542
ba7b4546 2543 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2544 sayNO_ANYOF;
ffc61ed2
JH
2545 if (locinput >= PL_regeol)
2546 sayNO;
0f0076b4 2547 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2548 nextchr = UCHARAT(locinput);
e0f9d4a8 2549 break;
ffc61ed2
JH
2550 }
2551 else {
2552 if (nextchr < 0)
2553 nextchr = UCHARAT(locinput);
7d3e948e 2554 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2555 sayNO_ANYOF;
ffc61ed2
JH
2556 if (!nextchr && locinput >= PL_regeol)
2557 sayNO;
2558 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2559 break;
2560 }
2561 no_anyof:
2562 /* If we might have the case of the German sharp s
2563 * in a casefolding Unicode character class. */
2564
ebc501f0
JH
2565 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2566 locinput += SHARP_S_SKIP;
e0f9d4a8 2567 nextchr = UCHARAT(locinput);
ffc61ed2 2568 }
e0f9d4a8
JH
2569 else
2570 sayNO;
b8c5462f 2571 break;
d6a28714 2572 case ALNUML:
b8c5462f
JH
2573 PL_reg_flags |= RF_tainted;
2574 /* FALL THROUGH */
d6a28714 2575 case ALNUM:
b8c5462f 2576 if (!nextchr)
4633a7c4 2577 sayNO;
ffc61ed2 2578 if (do_utf8) {
ad24be35 2579 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2580 if (!(OP(scan) == ALNUM
3568d838 2581 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2582 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2583 {
2584 sayNO;
a0ed51b3 2585 }
b8c5462f 2586 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2587 nextchr = UCHARAT(locinput);
2588 break;
2589 }
ffc61ed2 2590 if (!(OP(scan) == ALNUM
d6a28714 2591 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2592 sayNO;
b8c5462f 2593 nextchr = UCHARAT(++locinput);
a0d0e21e 2594 break;
d6a28714 2595 case NALNUML:
b8c5462f
JH
2596 PL_reg_flags |= RF_tainted;
2597 /* FALL THROUGH */
d6a28714
JH
2598 case NALNUM:
2599 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2600 sayNO;
ffc61ed2 2601 if (do_utf8) {
8269fa76 2602 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2603 if (OP(scan) == NALNUM
3568d838 2604 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2605 : isALNUM_LC_utf8((U8*)locinput))
2606 {
b8c5462f 2607 sayNO;
d6a28714 2608 }
b8c5462f
JH
2609 locinput += PL_utf8skip[nextchr];
2610 nextchr = UCHARAT(locinput);
2611 break;
2612 }
ffc61ed2 2613 if (OP(scan) == NALNUM
d6a28714 2614 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2615 sayNO;
76e3520e 2616 nextchr = UCHARAT(++locinput);
a0d0e21e 2617 break;
d6a28714
JH
2618 case BOUNDL:
2619 case NBOUNDL:
3280af22 2620 PL_reg_flags |= RF_tainted;
bbce6d69 2621 /* FALL THROUGH */
d6a28714
JH
2622 case BOUND:
2623 case NBOUND:
2624 /* was last char in word? */
ffc61ed2 2625 if (do_utf8) {
12d33761
HS
2626 if (locinput == PL_bostr)
2627 ln = '\n';
ffc61ed2 2628 else {
b4f7163a 2629 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2630
b4f7163a 2631 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2632 }
2633 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2634 ln = isALNUM_uni(ln);
8269fa76 2635 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2636 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2637 }
2638 else {
9041c2e3 2639 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2640 n = isALNUM_LC_utf8((U8*)locinput);
2641 }
a0ed51b3 2642 }
d6a28714 2643 else {
12d33761
HS
2644 ln = (locinput != PL_bostr) ?
2645 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2646 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2647 ln = isALNUM(ln);
2648 n = isALNUM(nextchr);
2649 }
2650 else {
2651 ln = isALNUM_LC(ln);
2652 n = isALNUM_LC(nextchr);
2653 }
d6a28714 2654 }
ffc61ed2
JH
2655 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2656 OP(scan) == BOUNDL))
2657 sayNO;
a0ed51b3 2658 break;
d6a28714 2659 case SPACEL:
3280af22 2660 PL_reg_flags |= RF_tainted;
bbce6d69 2661 /* FALL THROUGH */
d6a28714 2662 case SPACE:
9442cb0e 2663 if (!nextchr)
4633a7c4 2664 sayNO;
1aa99e6b 2665 if (do_utf8) {
fd400ab9 2666 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2667 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2668 if (!(OP(scan) == SPACE
3568d838 2669 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2670 : isSPACE_LC_utf8((U8*)locinput)))
2671 {
2672 sayNO;
2673 }
2674 locinput += PL_utf8skip[nextchr];
2675 nextchr = UCHARAT(locinput);
2676 break;
d6a28714 2677 }
ffc61ed2
JH
2678 if (!(OP(scan) == SPACE
2679 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2680 sayNO;
2681 nextchr = UCHARAT(++locinput);
2682 }
2683 else {
2684 if (!(OP(scan) == SPACE
2685 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2686 sayNO;
2687 nextchr = UCHARAT(++locinput);
a0ed51b3 2688 }
a0ed51b3 2689 break;
d6a28714 2690 case NSPACEL:
3280af22 2691 PL_reg_flags |= RF_tainted;
bbce6d69 2692 /* FALL THROUGH */
d6a28714 2693 case NSPACE:
9442cb0e 2694 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2695 sayNO;
1aa99e6b 2696 if (do_utf8) {
8269fa76 2697 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2698 if (OP(scan) == NSPACE
3568d838 2699 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2700 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2701 {
2702 sayNO;
2703 }
2704 locinput += PL_utf8skip[nextchr];
2705 nextchr = UCHARAT(locinput);
2706 break;
a0ed51b3 2707 }
ffc61ed2 2708 if (OP(scan) == NSPACE
d6a28714 2709 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2710 sayNO;
76e3520e 2711 nextchr = UCHARAT(++locinput);
a0d0e21e 2712 break;
d6a28714 2713 case DIGITL:
a0ed51b3
LW
2714 PL_reg_flags |= RF_tainted;
2715 /* FALL THROUGH */
d6a28714 2716 case DIGIT:
9442cb0e 2717 if (!nextchr)
a0ed51b3 2718 sayNO;
1aa99e6b 2719 if (do_utf8) {
8269fa76 2720 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2721 if (!(OP(scan) == DIGIT
3568d838 2722 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2723 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2724 {
a0ed51b3 2725 sayNO;
dfe13c55 2726 }
6f06b55f 2727 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2728 nextchr = UCHARAT(locinput);
2729 break;
2730 }
ffc61ed2 2731 if (!(OP(scan) == DIGIT
9442cb0e 2732 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2733 sayNO;
2734 nextchr = UCHARAT(++locinput);
2735 break;
d6a28714 2736 case NDIGITL:
b8c5462f
JH
2737 PL_reg_flags |= RF_tainted;
2738 /* FALL THROUGH */
d6a28714 2739 case NDIGIT:
9442cb0e 2740 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2741 sayNO;
1aa99e6b 2742 if (do_utf8) {
8269fa76 2743 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2744 if (OP(scan) == NDIGIT
3568d838 2745 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2746 : isDIGIT_LC_utf8((U8*)locinput))
2747 {
a0ed51b3 2748 sayNO;
9442cb0e 2749 }
6f06b55f 2750 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2751 nextchr = UCHARAT(locinput);
2752 break;
2753 }
ffc61ed2 2754 if (OP(scan) == NDIGIT
9442cb0e 2755 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2756 sayNO;
2757 nextchr = UCHARAT(++locinput);
2758 break;
2759 case CLUMP:
b7c83a7e 2760 if (locinput >= PL_regeol)
a0ed51b3 2761 sayNO;
b7c83a7e
JH
2762 if (do_utf8) {
2763 LOAD_UTF8_CHARCLASS(mark,"~");
2764 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2765 sayNO;
2766 locinput += PL_utf8skip[nextchr];
2767 while (locinput < PL_regeol &&
2768 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2769 locinput += UTF8SKIP(locinput);
2770 if (locinput > PL_regeol)
2771 sayNO;
eb08e2da
JH
2772 }
2773 else
2774 locinput++;
a0ed51b3
LW
2775 nextchr = UCHARAT(locinput);
2776 break;
c8756f30 2777 case REFFL:
3280af22 2778 PL_reg_flags |= RF_tainted;
c8756f30 2779 /* FALL THROUGH */
c277df42 2780 case REF:
c8756f30 2781 case REFF:
c277df42 2782 n = ARG(scan); /* which paren pair */
cf93c79d 2783 ln = PL_regstartp[n];
2c2d71f5 2784 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2785 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2786 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2787 if (ln == PL_regendp[n])
a0d0e21e 2788 break;
a0ed51b3 2789
cf93c79d 2790 s = PL_bostr + ln;
1aa99e6b 2791 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2792 char *l = locinput;
cf93c79d 2793 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2794 /*
2795 * Note that we can't do the "other character" lookup trick as
2796 * in the 8-bit case (no pun intended) because in Unicode we
2797 * have to map both upper and title case to lower case.
2798 */
2799 if (OP(scan) == REFF) {
a2a2844f 2800 STRLEN ulen1, ulen2;
e7ae6809
JH
2801 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2802 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2803 while (s < e) {
2804 if (l >= PL_regeol)
2805 sayNO;
a2a2844f
JH
2806 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2807 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2808 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2809 sayNO;
a2a2844f
JH
2810 s += ulen1;
2811 l += ulen2;
a0ed51b3
LW
2812 }
2813 }
2814 locinput = l;
2815 nextchr = UCHARAT(locinput);
2816 break;
2817 }
2818
a0d0e21e 2819 /* Inline the first character, for speed. */
76e3520e 2820 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2821 (OP(scan) == REF ||
2822 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2823 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2824 sayNO;
cf93c79d 2825 ln = PL_regendp[n] - ln;
3280af22 2826 if (locinput + ln > PL_regeol)
4633a7c4 2827 sayNO;
c8756f30
AK
2828 if (ln > 1 && (OP(scan) == REF
2829 ? memNE(s, locinput, ln)
2830 : (OP(scan) == REFF
2831 ? ibcmp(s, locinput, ln)
2832 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2833 sayNO;
a0d0e21e 2834 locinput += ln;
76e3520e 2835 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2836 break;
2837
2838 case NOTHING:
c277df42 2839 case TAIL:
a0d0e21e
LW
2840 break;
2841 case BACK:
2842 break;
c277df42
IZ
2843 case EVAL:
2844 {
2845 dSP;
533c011a 2846 OP_4tree *oop = PL_op;
3280af22 2847 COP *ocurcop = PL_curcop;
f3548bdc 2848 PAD *old_comppad;
c277df42 2849 SV *ret;
080c2dec 2850 struct regexp *oreg = PL_reg_re;
9041c2e3 2851
c277df42 2852 n = ARG(scan);
533c011a 2853 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2854 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2855 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2856 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2857
8e5e9ebe
RGS
2858 {
2859 SV **before = SP;
2860 CALLRUNOPS(aTHX); /* Scalar context. */
2861 SPAGAIN;
2862 if (SP == before)
075aa684 2863 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2864 else {
2865 ret = POPs;
2866 PUTBACK;
2867 }
2868 }
2869
0f5d15d6 2870 PL_op = oop;
f3548bdc 2871 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2872 PL_curcop = ocurcop;
c277df42 2873 if (logical) {
0f5d15d6
IZ
2874 if (logical == 2) { /* Postponed subexpression. */
2875 regexp *re;
22c35a8c 2876 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2877 re_cc_state state;
0f5d15d6 2878 CHECKPOINT cp, lastcp;
cb50f42d 2879 int toggleutf;
faf82a0b 2880 register SV *sv;
0f5d15d6 2881
faf82a0b
AE
2882 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2883 mg = mg_find(sv, PERL_MAGIC_qr);
2884 else if (SvSMAGICAL(ret)) {
2885 if (SvGMAGICAL(ret))
2886 sv_unmagic(ret, PERL_MAGIC_qr);
2887 else
2888 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 2889 }
faf82a0b 2890
0f5d15d6
IZ
2891 if (mg) {
2892 re = (regexp *)mg->mg_obj;
df0003d4 2893 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2894 }
2895 else {
2896 STRLEN len;
2897 char *t = SvPV(ret, len);
2898 PMOP pm;
2899 char *oprecomp = PL_regprecomp;
2900 I32 osize = PL_regsize;
2901 I32 onpar = PL_regnpar;
2902
5fcd1c1b 2903 Zero(&pm, 1, PMOP);
cb50f42d 2904 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2905 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2906 if (!(SvFLAGS(ret)
faf82a0b
AE
2907 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2908 | SVs_GMG)))
14befaf4
DM
2909 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2910 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2911 PL_regprecomp = oprecomp;
2912 PL_regsize = osize;
2913 PL_regnpar = onpar;
2914 }
2915 DEBUG_r(
9041c2e3 2916 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2917 "Entering embedded `%s%.60s%s%s'\n",
2918 PL_colors[0],
2919 re->precomp,
2920 PL_colors[1],
2921 (strlen(re->precomp) > 60 ? "..." : ""))
2922 );
2923 state.node = next;
2924 state.prev = PL_reg_call_cc;
2925 state.cc = PL_regcc;
2926 state.re = PL_reg_re;
2927
2ab05381 2928 PL_regcc = 0;
9041c2e3 2929
0f5d15d6 2930 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2931 REGCP_SET(lastcp);
0f5d15d6
IZ
2932 cache_re(re);
2933 state.ss = PL_savestack_ix;
2934 *PL_reglastparen = 0;
a01268b5 2935 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2936 PL_reg_call_cc = &state;
2937 PL_reginput = locinput;
cb50f42d
YST
2938 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2939 ((re->reganch & ROPT_UTF8) != 0);
2940 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2941
2942 /* XXXX This is too dramatic a measure... */
2943 PL_reg_maxiter = 0;
2944
0f5d15d6 2945 if (regmatch(re->program + 1)) {
2c914db6
IZ
2946 /* Even though we succeeded, we need to restore
2947 global variables, since we may be wrapped inside
2948 SUSPEND, thus the match may be not finished yet. */
2949
2950 /* XXXX Do this only if SUSPENDed? */
2951 PL_reg_call_cc = state.prev;
2952 PL_regcc = state.cc;
2953 PL_reg_re = state.re;
2954 cache_re(PL_reg_re);
cb50f42d 2955 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2956
2957 /* XXXX This is too dramatic a measure... */
2958 PL_reg_maxiter = 0;
2959
2960 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2961 ReREFCNT_dec(re);
2962 regcpblow(cp);
2963 sayYES;
2964 }
0f5d15d6 2965 ReREFCNT_dec(re);
02db2b7b 2966 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2967 regcppop();
2968 PL_reg_call_cc = state.prev;
2969 PL_regcc = state.cc;
2970 PL_reg_re = state.re;
d3790889 2971 cache_re(PL_reg_re);
cb50f42d 2972 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2973
2974 /* XXXX This is too dramatic a measure... */
2975 PL_reg_maxiter = 0;
2976
8e514ae6 2977 logical = 0;
0f5d15d6
IZ
2978 sayNO;
2979 }
c277df42 2980 sw = SvTRUE(ret);
0f5d15d6 2981 logical = 0;
a0ed51b3 2982 }
080c2dec 2983 else {
3280af22 2984 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
2985 cache_re(oreg);
2986 }
c277df42
IZ
2987 break;
2988 }
a0d0e21e 2989 case OPEN:
c277df42 2990 n = ARG(scan); /* which paren pair */
3280af22
NIS
2991 PL_reg_start_tmp[n] = locinput;
2992 if (n > PL_regsize)
2993 PL_regsize = n;
a0d0e21e
LW
2994 break;
2995 case CLOSE:
c277df42 2996 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2997 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2998 PL_regendp[n] = locinput - PL_bostr;
eb160463 2999 if (n > (I32)*PL_reglastparen)
3280af22 3000 *PL_reglastparen = n;
a01268b5 3001 *PL_reglastcloseparen = n;
a0d0e21e 3002 break;
c277df42
IZ
3003 case GROUPP:
3004 n = ARG(scan); /* which paren pair */
eb160463 3005 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3006 break;
3007 case IFTHEN:
2c2d71f5 3008 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3009 if (sw)
3010 next = NEXTOPER(NEXTOPER(scan));
3011 else {
3012 next = scan + ARG(scan);
3013 if (OP(next) == IFTHEN) /* Fake one. */
3014 next = NEXTOPER(NEXTOPER(next));
3015 }
3016 break;
3017 case LOGICAL:
0f5d15d6 3018 logical = scan->flags;
c277df42 3019 break;
2ab05381
IZ
3020/*******************************************************************
3021 PL_regcc contains infoblock about the innermost (...)* loop, and
3022 a pointer to the next outer infoblock.
3023
3024 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3025
3026 1) After matching X, regnode for CURLYX is processed;
3027
9041c2e3 3028 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3029 regmatch() recursively with the starting point at WHILEM node;
3030
3031 3) Each hit of WHILEM node tries to match A and Z (in the order
3032 depending on the current iteration, min/max of {min,max} and
3033 greediness). The information about where are nodes for "A"
3034 and "Z" is read from the infoblock, as is info on how many times "A"
3035 was already matched, and greediness.
3036
3037 4) After A matches, the same WHILEM node is hit again.
3038
3039 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3040 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3041 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3042 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3043 of the external loop.
3044
3045 Currently present infoblocks form a tree with a stem formed by PL_curcc
3046 and whatever it mentions via ->next, and additional attached trees
3047 corresponding to temporarily unset infoblocks as in "5" above.
3048
9041c2e3 3049 In the following picture infoblocks for outer loop of
2ab05381
IZ
3050 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3051 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3052 infoblocks are drawn below the "reset" infoblock.
3053
3054 In fact in the picture below we do not show failed matches for Z and T
3055 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3056 more obvious *why* one needs to *temporary* unset infoblocks.]
3057
3058 Matched REx position InfoBlocks Comment
3059 (Y(A)*?Z)*?T x
3060 Y(A)*?Z)*?T x <- O
3061 Y (A)*?Z)*?T x <- O
3062 Y A)*?Z)*?T x <- O <- I
3063 YA )*?Z)*?T x <- O <- I
3064 YA A)*?Z)*?T x <- O <- I
3065 YAA )*?Z)*?T x <- O <- I
3066 YAA Z)*?T x <- O # Temporary unset I
3067 I
3068
3069 YAAZ Y(A)*?Z)*?T x <- O
3070 I
3071
3072 YAAZY (A)*?Z)*?T x <- O
3073 I
3074
3075 YAAZY A)*?Z)*?T x <- O <- I
3076 I
3077
3078 YAAZYA )*?Z)*?T x <- O <- I
3079 I
3080
3081 YAAZYA Z)*?T x <- O # Temporary unset I
3082 I,I
3083
3084 YAAZYAZ )*?T x <- O
3085 I,I
3086
3087 YAAZYAZ T x # Temporary unset O
3088 O
3089 I,I
3090
3091 YAAZYAZT x
3092 O
3093 I,I
3094 *******************************************************************/
a0d0e21e
LW
3095 case CURLYX: {
3096 CURCUR cc;
3280af22 3097 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3098 /* No need to save/restore up to this paren */
3099 I32 parenfloor = scan->flags;
c277df42
IZ
3100
3101 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3102 next += ARG(next);
3280af22
NIS
3103 cc.oldcc = PL_regcc;
3104 PL_regcc = &cc;
cb434fcc
IZ
3105 /* XXXX Probably it is better to teach regpush to support
3106 parenfloor > PL_regsize... */
eb160463 3107 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3108 parenfloor = *PL_reglastparen; /* Pessimization... */
3109 cc.parenfloor = parenfloor;
a0d0e21e
LW
3110 cc.cur = -1;
3111 cc.min = ARG1(scan);
3112 cc.max = ARG2(scan);
c277df42 3113 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3114 cc.next = next;
3115 cc.minmod = minmod;
3116 cc.lastloc = 0;
3280af22 3117 PL_reginput = locinput;
a0d0e21e
LW
3118 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3119 regcpblow(cp);
3280af22 3120 PL_regcc = cc.oldcc;
4633a7c4 3121 saySAME(n);
a0d0e21e
LW
3122 }
3123 /* NOT REACHED */
3124 case WHILEM: {
3125 /*
3126 * This is really hard to understand, because after we match
3127 * what we're trying to match, we must make sure the rest of
2c2d71f5 3128 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3129 * to go back UP the parse tree by recursing ever deeper. And
3130 * if it fails, we have to reset our parent's current state
3131 * that we can try again after backing off.
3132 */
3133
c277df42 3134 CHECKPOINT cp, lastcp;
3280af22 3135 CURCUR* cc = PL_regcc;
c277df42
IZ
3136 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3137
4633a7c4 3138 n = cc->cur + 1; /* how many we know we matched */
3280af22 3139 PL_reginput = locinput;
a0d0e21e 3140
c277df42 3141 DEBUG_r(
9041c2e3 3142 PerlIO_printf(Perl_debug_log,
91f3b821 3143 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3144 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3145 (long)n, (long)cc->min,
2797576d 3146 (long)cc->max, PTR2UV(cc))
c277df42 3147 );
4633a7c4 3148
a0d0e21e
LW
3149 /* If degenerate scan matches "", assume scan done. */
3150
579cf2c3 3151 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3152 PL_regcc = cc->oldcc;
2ab05381
IZ
3153 if (PL_regcc)
3154 ln = PL_regcc->cur;
c277df42 3155 DEBUG_r(
c3464db5
DD
3156 PerlIO_printf(Perl_debug_log,
3157 "%*s empty match detected, try continuation...\n",
3280af22 3158 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3159 );
a0d0e21e 3160 if (regmatch(cc->next))
4633a7c4 3161 sayYES;
2ab05381
IZ
3162 if (PL_regcc)
3163 PL_regcc->cur = ln;
3280af22 3164 PL_regcc = cc;
4633a7c4 3165 sayNO;
a0d0e21e
LW
3166 }
3167
3168 /* First just match a string of min scans. */
3169
3170 if (n < cc->min) {
3171 cc->cur = n;
3172 cc->lastloc = locinput;
4633a7c4
LW
3173 if (regmatch(cc->scan))
3174 sayYES;
3175 cc->cur = n - 1;
c277df42 3176 cc->lastloc = lastloc;
4633a7c4 3177 sayNO;
a0d0e21e
LW
3178 }
3179
2c2d71f5
JH
3180 if (scan->flags) {
3181 /* Check whether we already were at this position.
3182 Postpone detection until we know the match is not
3183 *that* much linear. */
3184 if (!PL_reg_maxiter) {
3185 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3186 PL_reg_leftiter = PL_reg_maxiter;
3187 }
3188 if (PL_reg_leftiter-- == 0) {
3189 I32 size = (PL_reg_maxiter + 7)/8;
3190 if (PL_reg_poscache) {
eb160463 3191 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3192 Renew(PL_reg_poscache, size, char);
3193 PL_reg_poscache_size = size;
3194 }
3195 Zero(PL_reg_poscache, size, char);
3196 }
3197 else {
3198 PL_reg_poscache_size = size;
3199 Newz(29, PL_reg_poscache, size, char);
3200 }
3201 DEBUG_r(
3202 PerlIO_printf(Perl_debug_log,
3203 "%sDetected a super-linear match, switching on caching%s...\n",
3204 PL_colors[4], PL_colors[5])
3205 );
3206 }
3207 if (PL_reg_leftiter < 0) {
3208 I32 o = locinput - PL_bostr, b;
3209
3210 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3211 b = o % 8;
3212 o /= 8;
3213 if (PL_reg_poscache[o] & (1<<b)) {
3214 DEBUG_r(
3215 PerlIO_printf(Perl_debug_log,
3216 "%*s already tried at this position...\n",
3217 REPORT_CODE_OFF+PL_regindent*2, "")
3218 );
c2b0868c
HS
3219 if (PL_reg_flags & RF_false)
3220 sayYES;
3221 else
3222 sayNO_SILENT;
2c2d71f5
JH
3223 }
3224 PL_reg_poscache[o] |= (1<<b);
3225 }
3226 }
3227
a0d0e21e
LW
3228 /* Prefer next over scan for minimal matching. */
3229
3230 if (cc->minmod) {
3280af22 3231 PL_regcc = cc->oldcc;
2ab05381
IZ
3232 if (PL_regcc)
3233 ln = PL_regcc->cur;
5f05dabc 3234 cp = regcppush(cc->parenfloor);
02db2b7b 3235 REGCP_SET(lastcp);
5f05dabc 3236 if (regmatch(cc->next)) {
c277df42 3237 regcpblow(cp);
4633a7c4 3238 sayYES; /* All done. */
5f05dabc 3239 }
02db2b7b 3240 REGCP_UNWIND(lastcp);
5f05dabc 3241 regcppop();
2ab05381
IZ
3242 if (PL_regcc)
3243 PL_regcc->cur = ln;
3280af22 3244 PL_regcc = cc;
a0d0e21e 3245
c277df42 3246 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3247 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3248 && !(PL_reg_flags & RF_warned)) {
3249 PL_reg_flags |= RF_warned;
9014280d 3250 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3251 "Complex regular subexpression recursion",
3252 REG_INFTY - 1);
c277df42 3253 }
4633a7c4 3254 sayNO;
c277df42 3255 }
a687059c 3256
c277df42 3257 DEBUG_r(
c3464db5
DD
3258 PerlIO_printf(Perl_debug_log,
3259 "%*s trying longer...\n",
3280af22 3260 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3261 );
a0d0e21e 3262 /* Try scanning more and see if it helps. */
3280af22 3263 PL_reginput = locinput;
a0d0e21e
LW
3264 cc->cur = n;
3265 cc->lastloc = locinput;
5f05dabc 3266 cp = regcppush(cc->parenfloor);
02db2b7b 3267 REGCP_SET(lastcp);
5f05dabc 3268 if (regmatch(cc->scan)) {
c277df42 3269 regcpblow(cp);
4633a7c4 3270 sayYES;
5f05dabc 3271 }
02db2b7b 3272 REGCP_UNWIND(lastcp);
5f05dabc 3273 regcppop();
4633a7c4 3274 cc->cur = n - 1;
c277df42 3275 cc->lastloc = lastloc;
4633a7c4 3276 sayNO;
a0d0e21e
LW
3277 }
3278
3279 /* Prefer scan over next for maximal matching. */
3280
3281 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3282 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3283 cc->cur = n;
3284 cc->lastloc = locinput;
02db2b7b 3285 REGCP_SET(lastcp);
5f05dabc 3286 if (regmatch(cc->scan)) {
c277df42 3287 regcpblow(cp);
4633a7c4 3288 sayYES;
5f05dabc 3289 }
02db2b7b 3290 REGCP_UNWIND(lastcp);
a0d0e21e 3291 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3292 PL_reginput = locinput;
c277df42 3293 DEBUG_r(
c3464db5
DD
3294 PerlIO_printf(Perl_debug_log,
3295 "%*s failed, try continuation...\n",
3280af22 3296 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3297 );
3298 }
9041c2e3 3299 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3300 && !(PL_reg_flags & RF_warned)) {
3280af22 3301 PL_reg_flags |= RF_warned;
9014280d 3302 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3303 "Complex regular subexpression recursion",
3304 REG_INFTY - 1);
a0d0e21e
LW
3305 }
3306
3307 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3308 PL_regcc = cc->oldcc;
2ab05381
IZ
3309 if (PL_regcc)
3310 ln = PL_regcc->cur;
a0d0e21e 3311 if (regmatch(cc->next))
4633a7c4 3312 sayYES;
2ab05381
IZ
3313 if (PL_regcc)
3314 PL_regcc->cur = ln;
3280af22 3315 PL_regcc = cc;
4633a7c4 3316 cc->cur = n - 1;
c277df42 3317 cc->lastloc = lastloc;
4633a7c4 3318 sayNO;
a0d0e21e
LW
3319 }
3320 /* NOT REACHED */
9041c2e3 3321 case BRANCHJ:
c277df42
IZ
3322 next = scan + ARG(scan);
3323 if (next == scan)
3324 next = NULL;
3325 inner = NEXTOPER(NEXTOPER(scan));
3326 goto do_branch;
9041c2e3 3327 case BRANCH:
c277df42
IZ
3328 inner = NEXTOPER(scan);
3329 do_branch:
3330 {
c277df42
IZ
3331 c1 = OP(scan);
3332 if (OP(next) != c1) /* No choice. */
3333 next = inner; /* Avoid recursion. */
a0d0e21e 3334 else {
02db2b7b
IZ
3335 I32 lastparen = *PL_reglastparen;
3336 I32 unwind1;
3337 re_unwind_branch_t *uw;
3338
3339 /* Put unwinding data on stack */
3340 unwind1 = SSNEWt(1,re_unwind_branch_t);
3341 uw = SSPTRt(unwind1,re_unwind_branch_t);
3342 uw->prev = unwind;
3343 unwind = unwind1;
3344 uw->type = ((c1 == BRANCH)
3345 ? RE_UNWIND_BRANCH
3346 : RE_UNWIND_BRANCHJ);
3347 uw->lastparen = lastparen;
3348 uw->next = next;
3349 uw->locinput = locinput;
3350 uw->nextchr = nextchr;
3351#ifdef DEBUGGING
3352 uw->regindent = ++PL_regindent;
3353#endif
c277df42 3354
02db2b7b
IZ
3355 REGCP_SET(uw->lastcp);
3356
3357 /* Now go into the first branch */
3358 next = inner;
a687059c 3359 }
a0d0e21e
LW
3360 }
3361 break;
3362 case MINMOD:
3363 minmod = 1;
3364 break;
c277df42
IZ
3365 case CURLYM:
3366 {
00db4c45 3367 I32 l = 0;
c277df42 3368 CHECKPOINT lastcp;
9041c2e3 3369
c277df42 3370 /* We suppose that the next guy does not need
0e788c72 3371 backtracking: in particular, it is of constant non-zero length,
c277df42
IZ
3372 and has no parenths to influence future backrefs. */
3373 ln = ARG1(scan); /* min to match */
3374 n = ARG2(scan); /* max to match */
c277df42
IZ
3375 paren = scan->flags;
3376 if (paren) {
3280af22
NIS
3377 if (paren > PL_regsize)
3378 PL_regsize = paren;
eb160463 3379 if (paren > (I32)*PL_reglastparen)
3280af22 3380 *PL_reglastparen = paren;
c277df42 3381 }
dc45a647 3382 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3383 if (paren)
3384 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3385 PL_reginput = locinput;
c277df42
IZ
3386 if (minmod) {
3387 minmod = 0;
3388 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3389 sayNO;
3280af22 3390 locinput = PL_reginput;
cca55fe3 3391 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3392 regnode *text_node = next;
3393
cca55fe3 3394 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3395
cca55fe3 3396 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3397 else {
cca55fe3 3398 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3399 c1 = c2 = -1000;
3400 goto assume_ok_MM;
cca55fe3
JP
3401 }
3402 else { c1 = (U8)*STRING(text_node); }
af5decee 3403 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3404 c2 = PL_fold[c1];
af5decee 3405 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3406 c2 = PL_fold_locale[c1];
3407 else
3408 c2 = c1;
3409 }
a0ed51b3
LW
3410 }
3411 else
c277df42 3412 c1 = c2 = -1000;
cca55fe3 3413 assume_ok_MM:
02db2b7b 3414 REGCP_SET(lastcp);
0e788c72 3415 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
c277df42
IZ
3416 /* If it could work, try it. */
3417 if (c1 == -1000 ||
3280af22
NIS
3418 UCHARAT(PL_reginput) == c1 ||
3419 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3420 {
3421 if (paren) {
f31a99c8 3422 if (ln) {
cf93c79d
IZ
3423 PL_regstartp[paren] =
3424 HOPc(PL_reginput, -l) - PL_bostr;
3425 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3426 }
3427 else
cf93c79d 3428 PL_regendp[paren] = -1;
c277df42
IZ
3429 }
3430 if (regmatch(next))
3431 sayYES;
02db2b7b 3432 REGCP_UNWIND(lastcp);
c277df42
IZ
3433 }
3434 /* Couldn't or didn't -- move forward. */
3280af22 3435 PL_reginput = locinput;
c277df42
IZ
3436 if (regrepeat_hard(scan, 1, &l)) {
3437 ln++;
3280af22 3438 locinput = PL_reginput;
c277df42
IZ
3439 }
3440 else
3441 sayNO;
3442 }
a0ed51b3
LW
3443 }
3444 else {
c277df42 3445 n = regrepeat_hard(scan, n, &l);
3280af22 3446 locinput = PL_reginput;
c277df42 3447 DEBUG_r(
5c0ca799 3448 PerlIO_printf(Perl_debug_log,
faccc32b 3449 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3450 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3451 (IV) n, (IV)l)
c277df42
IZ
3452 );
3453 if (n >= ln) {
cca55fe3 3454 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3455 regnode *text_node = next;
3456
cca55fe3 3457 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3458
cca55fe3 3459 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3460 else {
cca55fe3 3461 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3462 c1 = c2 = -1000;
3463 goto assume_ok_REG;
cca55fe3
JP
3464 }
3465 else { c1 = (U8)*STRING(text_node); }
3466
af5decee 3467 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3468 c2 = PL_fold[c1];
af5decee 3469 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3470 c2 = PL_fold_locale[c1];
3471 else
3472 c2 = c1;
3473 }
a0ed51b3
LW
3474 }
3475 else
c277df42
IZ
3476 c1 = c2 = -1000;
3477 }
cca55fe3 3478 assume_ok_REG:
02db2b7b 3479 REGCP_SET(lastcp);
c277df42
IZ
3480 while (n >= ln) {
3481 /* If it could work, try it. */
3482 if (c1 == -1000 ||
3280af22
NIS
3483 UCHARAT(PL_reginput) == c1 ||
3484 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3485 {
3486 DEBUG_r(
c3464db5 3487 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3488 "%*s trying tail with n=%"IVdf"...\n",
3489 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3490 );
3491 if (paren) {
3492 if (n) {
cf93c79d
IZ
3493 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3494 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3495 }
a0ed51b3 3496 else
cf93c79d 3497 PL_regendp[paren] = -1;
c277df42 3498 }
a0ed51b3
LW
3499 if (regmatch(next))
3500 sayYES;
02db2b7b 3501 REGCP_UNWIND(lastcp);
a0ed51b3 3502 }
c277df42
IZ
3503 /* Couldn't or didn't -- back up. */
3504 n--;
dfe13c55 3505 locinput = HOPc(locinput, -l);
3280af22 3506 PL_reginput = locinput;
c277df42
IZ
3507 }
3508 }
3509 sayNO;
3510 break;
3511 }
3512 case CURLYN:
3513 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3514 if (paren > PL_regsize)
3515 PL_regsize = paren;
eb160463 3516 if (paren > (I32)*PL_reglastparen)
3280af22 3517<