This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
The "Setuid script not plain file" error wasn't documented.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b5f8cc5c 82 **** 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
a687059c
LW
86 *
87 * Beware that some of this code is subtly aware of the way operator
88 * precedence is structured in regular expressions. Serious changes in
89 * regular-expression syntax might require a total rethink.
90 */
91#include "EXTERN.h"
864dbfa3 92#define PERL_IN_REGEXEC_C
a687059c 93#include "perl.h"
0f5d15d6 94
a687059c
LW
95#include "regcomp.h"
96
c277df42
IZ
97#define RF_tainted 1 /* tainted information used? */
98#define RF_warned 2 /* warned about big count? */
ce862d02 99#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3 100#define RF_utf8 8 /* String contains multibyte chars? */
c2b0868c 101#define RF_false 16 /* odd number of nested negatives */
a0ed51b3 102
eb160463 103#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
104
105#define RS_init 1 /* eval environment created */
106#define RS_set 2 /* replsv value is set */
c277df42 107
a687059c
LW
108#ifndef STATIC
109#define STATIC static
110#endif
111
ba7b4546 112#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 113
c277df42
IZ
114/*
115 * Forwards.
116 */
117
33b8afdf 118#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 119#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 120
dfe13c55
GS
121#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
122#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
123#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
124#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
125#define HOPc(pos,off) ((char*)HOP(pos,off))
126#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 127
efb30f32 128#define HOPBACK(pos, off) ( \
e54858b0 129 (PL_reg_match_utf8) \
efb30f32
HS
130 ? reghopmaybe((U8*)pos, -off) \
131 : (pos - off >= PL_bostr) \
132 ? (U8*)(pos - off) \
133 : (U8*)NULL \
134)
135#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
136
1aa99e6b
IH
137#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
138#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
139#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
140#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
141#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
142#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
143
977e4b84 144#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 145
5f80c4cf 146/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
147#define JUMPABLE(rn) ( \
148 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
149 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
150 OP(rn) == PLUS || OP(rn) == MINMOD || \
151 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
152)
153
cca55fe3
JP
154#define HAS_TEXT(rn) ( \
155 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
156)
e2d8ce26 157
a84d97b6
HS
158/*
159 Search for mandatory following text node; for lookahead, the text must
160 follow but for lookbehind (rn->flags != 0) we skip to the next step.
161*/
cca55fe3 162#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 163 while (JUMPABLE(rn)) \
a84d97b6 164 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 165 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
166 else if (OP(rn) == PLUS) \
167 rn = NEXTOPER(rn); \
a84d97b6
HS
168 else if (OP(rn) == IFMATCH) \
169 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 170 else rn += NEXT_OFF(rn); \
5f80c4cf 171} STMT_END
74750237 172
acfe0abc 173static void restore_pos(pTHX_ void *arg);
51371543 174
76e3520e 175STATIC CHECKPOINT
cea2e8a9 176S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 177{
3280af22 178 int retval = PL_savestack_ix;
b1ce53c5
JH
179#define REGCP_PAREN_ELEMS 4
180 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
181 int p;
182
e49a9654
IH
183 if (paren_elems_to_push < 0)
184 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
185
a01268b5 186#define REGCP_OTHER_ELEMS 6
4b3c1a47 187 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 188 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 189/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
190 SSPUSHINT(PL_regendp[p]);
191 SSPUSHINT(PL_regstartp[p]);
3280af22 192 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
193 SSPUSHINT(p);
194 }
b1ce53c5 195/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
196 SSPUSHINT(PL_regsize);
197 SSPUSHINT(*PL_reglastparen);
a01268b5 198 SSPUSHINT(*PL_reglastcloseparen);
3280af22 199 SSPUSHPTR(PL_reginput);
41123dfd
JH
200#define REGCP_FRAME_ELEMS 2
201/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
202 * are needed for the regexp context stack bookkeeping. */
203 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 204 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 205
a0d0e21e
LW
206 return retval;
207}
208
c277df42 209/* These are needed since we do not localize EVAL nodes: */
02db2b7b 210# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 211 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 212 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 213
02db2b7b 214# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 215 PerlIO_printf(Perl_debug_log, \
faccc32b 216 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 217 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 218
76e3520e 219STATIC char *
cea2e8a9 220S_regcppop(pTHX)
a0d0e21e 221{
b1ce53c5 222 I32 i;
a0d0e21e
LW
223 U32 paren = 0;
224 char *input;
cf93c79d 225 I32 tmps;
b1ce53c5
JH
226
227 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 228 i = SSPOPINT;
b1ce53c5
JH
229 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
230 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 231 input = (char *) SSPOPPTR;
a01268b5 232 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
233 *PL_reglastparen = SSPOPINT;
234 PL_regsize = SSPOPINT;
b1ce53c5
JH
235
236 /* Now restore the parentheses context. */
41123dfd
JH
237 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
238 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 239 paren = (U32)SSPOPINT;
3280af22 240 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
241 PL_regstartp[paren] = SSPOPINT;
242 tmps = SSPOPINT;
3280af22
NIS
243 if (paren <= *PL_reglastparen)
244 PL_regendp[paren] = tmps;
c277df42 245 DEBUG_r(
c3464db5 246 PerlIO_printf(Perl_debug_log,
b900a521 247 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 248 (UV)paren, (IV)PL_regstartp[paren],
b900a521 249 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 250 (IV)PL_regendp[paren],
3280af22 251 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 252 );
a0d0e21e 253 }
c277df42 254 DEBUG_r(
eb160463 255 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 256 PerlIO_printf(Perl_debug_log,
faccc32b
JH
257 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
258 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
259 }
260 );
daf18116 261#if 1
dafc8851
JH
262 /* It would seem that the similar code in regtry()
263 * already takes care of this, and in fact it is in
264 * a better location to since this code can #if 0-ed out
265 * but the code in regtry() is needed or otherwise tests
266 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
267 * (as of patchlevel 7877) will fail. Then again,
268 * this code seems to be necessary or otherwise
269 * building DynaLoader will fail:
270 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
271 * --jhi */
eb160463
GS
272 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
273 if ((I32)paren > PL_regsize)
cf93c79d
IZ
274 PL_regstartp[paren] = -1;
275 PL_regendp[paren] = -1;
a0d0e21e 276 }
dafc8851 277#endif
a0d0e21e
LW
278 return input;
279}
280
0f5d15d6 281STATIC char *
cea2e8a9 282S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
283{
284 I32 tmp = PL_savestack_ix;
285
286 PL_savestack_ix = ss;
287 regcppop();
288 PL_savestack_ix = tmp;
942e002e 289 return Nullch;
0f5d15d6
IZ
290}
291
292typedef struct re_cc_state
293{
294 I32 ss;
295 regnode *node;
296 struct re_cc_state *prev;
297 CURCUR *cc;
298 regexp *re;
299} re_cc_state;
300
02db2b7b 301#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 302
29d1e993
HS
303#define TRYPAREN(paren, n, input) { \
304 if (paren) { \
305 if (n) { \
306 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
307 PL_regendp[paren] = input - PL_bostr; \
308 } \
309 else \
310 PL_regendp[paren] = -1; \
311 } \
312 if (regmatch(next)) \
313 sayYES; \
314 if (paren && n) \
315 PL_regendp[paren] = -1; \
316}
317
318
a687059c 319/*
e50aee73 320 * pregexec and friends
a687059c
LW
321 */
322
323/*
c277df42 324 - pregexec - match a regexp against a string
a687059c 325 */
c277df42 326I32
864dbfa3 327Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 328 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
329/* strend: pointer to null at end of string */
330/* strbeg: real beginning of string */
331/* minend: end of match must be >=minend after stringarg. */
332/* nosave: For optimizations. */
333{
334 return
9041c2e3 335 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
336 nosave ? 0 : REXEC_COPY_STR);
337}
0f5d15d6
IZ
338
339STATIC void
cea2e8a9 340S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
341{
342 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
343#ifdef DEBUGGING
344 PL_regprogram = prog->program;
345#endif
346 PL_regnpar = prog->nparens;
9041c2e3
NIS
347 PL_regdata = prog->data;
348 PL_reg_re = prog;
0f5d15d6 349}
22e551b9 350
9041c2e3 351/*
cad2e5aa
JH
352 * Need to implement the following flags for reg_anch:
353 *
354 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
355 * USE_INTUIT_ML
356 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
357 * INTUIT_AUTORITATIVE_ML
358 * INTUIT_ONCE_NOML - Intuit can match in one location only.
359 * INTUIT_ONCE_ML
360 *
361 * Another flag for this function: SECOND_TIME (so that float substrs
362 * with giant delta may be not rechecked).
363 */
364
365/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
366
2c2d71f5 367/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
368 Otherwise, only SvCUR(sv) is used to get strbeg. */
369
370/* XXXX We assume that strpos is strbeg unless sv. */
371
6eb5f6b9
JH
372/* XXXX Some places assume that there is a fixed substring.
373 An update may be needed if optimizer marks as "INTUITable"
374 RExen without fixed substrings. Similarly, it is assumed that
375 lengths of all the strings are no more than minlen, thus they
376 cannot come from lookahead.
377 (Or minlen should take into account lookahead.) */
378
2c2d71f5
JH
379/* A failure to find a constant substring means that there is no need to make
380 an expensive call to REx engine, thus we celebrate a failure. Similarly,
381 finding a substring too deep into the string means that less calls to
30944b6d
IZ
382 regtry() should be needed.
383
384 REx compiler's optimizer found 4 possible hints:
385 a) Anchored substring;
386 b) Fixed substring;
387 c) Whether we are anchored (beginning-of-line or \G);
388 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 389 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
390 string which does not contradict any of them.
391 */
2c2d71f5 392
6eb5f6b9
JH
393/* Most of decisions we do here should have been done at compile time.
394 The nodes of the REx which we used for the search should have been
395 deleted from the finite automaton. */
396
cad2e5aa
JH
397char *
398Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
399 char *strend, U32 flags, re_scream_pos_data *data)
400{
b7953727 401 register I32 start_shift = 0;
cad2e5aa 402 /* Should be nonnegative! */
b7953727 403 register I32 end_shift = 0;
2c2d71f5
JH
404 register char *s;
405 register SV *check;
a1933d95 406 char *strbeg;
cad2e5aa 407 char *t;
33b8afdf 408 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 409 I32 ml_anch;
6eb5f6b9 410 register char *other_last = Nullch; /* other substr checked before this */
b7953727 411 char *check_at = Nullch; /* check substr found at this pos */
7fba1cd6 412 I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d
IZ
413#ifdef DEBUGGING
414 char *i_strpos = strpos;
ce333219 415 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 416#endif
a30b2f1f 417 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 418
b8d68ded
JH
419 if (prog->reganch & ROPT_UTF8) {
420 DEBUG_r(PerlIO_printf(Perl_debug_log,
421 "UTF-8 regex...\n"));
422 PL_reg_flags |= RF_utf8;
423 }
424
2a782b5b 425 DEBUG_r({
b8d68ded 426 char *s = PL_reg_match_utf8 ?
c728cb41
JH
427 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
428 strpos;
b8d68ded
JH
429 int len = PL_reg_match_utf8 ?
430 strlen(s) : strend - strpos;
2a782b5b
JH
431 if (!PL_colorset)
432 reginitcolors();
b8d68ded
JH
433 if (PL_reg_match_utf8)
434 DEBUG_r(PerlIO_printf(Perl_debug_log,
435 "UTF-8 target...\n"));
2a782b5b
JH
436 PerlIO_printf(Perl_debug_log,
437 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
438 PL_colors[4],PL_colors[5],PL_colors[0],
439 prog->precomp,
440 PL_colors[1],
441 (strlen(prog->precomp) > 60 ? "..." : ""),
442 PL_colors[0],
443 (int)(len > 60 ? 60 : len),
444 s, PL_colors[1],
445 (len > 60 ? "..." : "")
446 );
447 });
cad2e5aa 448
c344f387
JH
449 /* CHR_DIST() would be more correct here but it makes things slow. */
450 if (prog->minlen > strend - strpos) {
a72c7584
JH
451 DEBUG_r(PerlIO_printf(Perl_debug_log,
452 "String too short... [re_intuit_start]\n"));
cad2e5aa 453 goto fail;
2c2d71f5 454 }
a1933d95 455 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 456 PL_regeol = strend;
33b8afdf
JH
457 if (do_utf8) {
458 if (!prog->check_utf8 && prog->check_substr)
459 to_utf8_substr(prog);
460 check = prog->check_utf8;
461 } else {
462 if (!prog->check_substr && prog->check_utf8)
463 to_byte_substr(prog);
464 check = prog->check_substr;
465 }
466 if (check == &PL_sv_undef) {
467 DEBUG_r(PerlIO_printf(Perl_debug_log,
468 "Non-utf string cannot match utf check string\n"));
469 goto fail;
470 }
2c2d71f5 471 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
472 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
473 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 474 && !multiline ) ); /* Check after \n? */
cad2e5aa 475
7e25d62c
JH
476 if (!ml_anch) {
477 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
478 | ROPT_IMPLICIT)) /* not a real BOL */
479 /* SvCUR is not set on references: SvRV and SvPVX overlap */
480 && sv && !SvROK(sv)
481 && (strpos != strbeg)) {
482 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
483 goto fail;
484 }
485 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 486 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 487 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
488 I32 slen;
489
1aa99e6b 490 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
491 if (SvTAIL(check)) {
492 slen = SvCUR(check); /* >= 1 */
cad2e5aa 493
9041c2e3 494 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
495 || (strend - s == slen && strend[-1] != '\n')) {
496 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
497 goto fail_finish;
cad2e5aa
JH
498 }
499 /* Now should match s[0..slen-2] */
500 slen--;
653099ff 501 if (slen && (*SvPVX(check) != *s
cad2e5aa 502 || (slen > 1
653099ff 503 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
504 report_neq:
505 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
506 goto fail_finish;
507 }
cad2e5aa 508 }
653099ff
GS
509 else if (*SvPVX(check) != *s
510 || ((slen = SvCUR(check)) > 1
511 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
512 goto report_neq;
513 goto success_at_start;
7e25d62c 514 }
cad2e5aa 515 }
2c2d71f5 516 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 517 s = strpos;
2c2d71f5 518 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 519 end_shift = prog->minlen - start_shift -
653099ff 520 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 521 if (!ml_anch) {
653099ff
GS
522 I32 end = prog->check_offset_max + CHR_SVLEN(check)
523 - (SvTAIL(check) != 0);
1aa99e6b 524 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
525
526 if (end_shift < eshift)
527 end_shift = eshift;
528 }
cad2e5aa 529 }
2c2d71f5 530 else { /* Can match at random position */
cad2e5aa
JH
531 ml_anch = 0;
532 s = strpos;
2c2d71f5
JH
533 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
534 /* Should be nonnegative! */
535 end_shift = prog->minlen - start_shift -
653099ff 536 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
537 }
538
2c2d71f5 539#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 540 if (end_shift < 0)
6bbae5e6 541 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
542#endif
543
2c2d71f5
JH
544 restart:
545 /* Find a possible match in the region s..strend by looking for
546 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 547 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
548 I32 p = -1; /* Internal iterator of scream. */
549 I32 *pp = data ? data->scream_pos : &p;
550
2c2d71f5
JH
551 if (PL_screamfirst[BmRARE(check)] >= 0
552 || ( BmRARE(check) == '\n'
553 && (BmPREVIOUS(check) == SvCUR(check) - 1)
554 && SvTAIL(check) ))
9041c2e3 555 s = screaminstr(sv, check,
2c2d71f5 556 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 557 else
2c2d71f5 558 goto fail_finish;
4addbd3b
HS
559 /* we may be pointing at the wrong string */
560 if (s && RX_MATCH_COPIED(prog))
7ef91622 561 s = strbeg + (s - SvPVX(sv));
cad2e5aa
JH
562 if (data)
563 *data->scream_olds = s;
564 }
f33976b4 565 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
566 s = fbm_instr((U8*)(s + start_shift),
567 (U8*)(strend - end_shift),
7fba1cd6 568 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 569 else
1aa99e6b
IH
570 s = fbm_instr(HOP3(s, start_shift, strend),
571 HOP3(strend, -end_shift, strbeg),
7fba1cd6 572 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
573
574 /* Update the count-of-usability, remove useless subpatterns,
575 unshift s. */
2c2d71f5
JH
576
577 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
578 (s ? "Found" : "Did not find"),
33b8afdf 579 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 580 PL_colors[0],
7b0972df
JH
581 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
582 SvPVX(check),
2c2d71f5
JH
583 PL_colors[1], (SvTAIL(check) ? "$" : ""),
584 (s ? " at offset " : "...\n") ) );
585
586 if (!s)
587 goto fail_finish;
588
6eb5f6b9
JH
589 check_at = s;
590
2c2d71f5 591 /* Finish the diagnostic message */
30944b6d 592 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
593
594 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
595 Start with the other substr.
596 XXXX no SCREAM optimization yet - and a very coarse implementation
597 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
598 *always* match. Probably should be marked during compile...
599 Probably it is right to do no SCREAM here...
600 */
601
33b8afdf 602 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 603 /* Take into account the "other" substring. */
2c2d71f5
JH
604 /* XXXX May be hopelessly wrong for UTF... */
605 if (!other_last)
6eb5f6b9 606 other_last = strpos;
33b8afdf 607 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
608 do_other_anchored:
609 {
1aa99e6b 610 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 611 char *s1 = s;
33b8afdf 612 SV* must;
2c2d71f5 613
2c2d71f5
JH
614 t = s - prog->check_offset_max;
615 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 616 && (!do_utf8
1aa99e6b 617 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 618 && t > strpos)))
30944b6d 619 /* EMPTY */;
2c2d71f5
JH
620 else
621 t = strpos;
1aa99e6b 622 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
623 if (t < other_last) /* These positions already checked */
624 t = other_last;
1aa99e6b 625 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
626 if (last < last1)
627 last1 = last;
628 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
629 /* On end-of-str: see comment below. */
33b8afdf
JH
630 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
631 if (must == &PL_sv_undef) {
632 s = (char*)NULL;
633 DEBUG_r(must = prog->anchored_utf8); /* for debug */
634 }
635 else
636 s = fbm_instr(
637 (unsigned char*)t,
638 HOP3(HOP3(last1, prog->anchored_offset, strend)
639 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
640 must,
7fba1cd6 641 multiline ? FBMrf_MULTILINE : 0
33b8afdf 642 );
1aa99e6b
IH
643 DEBUG_r(PerlIO_printf(Perl_debug_log,
644 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
645 (s ? "Found" : "Contradicts"),
646 PL_colors[0],
33b8afdf
JH
647 (int)(SvCUR(must)
648 - (SvTAIL(must)!=0)),
649 SvPVX(must),
650 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
651 if (!s) {
652 if (last1 >= last2) {
653 DEBUG_r(PerlIO_printf(Perl_debug_log,
654 ", giving up...\n"));
655 goto fail_finish;
656 }
657 DEBUG_r(PerlIO_printf(Perl_debug_log,
658 ", trying floating at offset %ld...\n",
1aa99e6b
IH
659 (long)(HOP3c(s1, 1, strend) - i_strpos)));
660 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
661 s = HOP3c(last, 1, strend);
2c2d71f5
JH
662 goto restart;
663 }
664 else {
665 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 666 (long)(s - i_strpos)));
1aa99e6b
IH
667 t = HOP3c(s, -prog->anchored_offset, strbeg);
668 other_last = HOP3c(s, 1, strend);
30944b6d 669 s = s1;
2c2d71f5
JH
670 if (t == strpos)
671 goto try_at_start;
2c2d71f5
JH
672 goto try_at_offset;
673 }
30944b6d 674 }
2c2d71f5
JH
675 }
676 else { /* Take into account the floating substring. */
33b8afdf
JH
677 char *last, *last1;
678 char *s1 = s;
679 SV* must;
680
681 t = HOP3c(s, -start_shift, strbeg);
682 last1 = last =
683 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
684 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
685 last = HOP3c(t, prog->float_max_offset, strend);
686 s = HOP3c(t, prog->float_min_offset, strend);
687 if (s < other_last)
688 s = other_last;
2c2d71f5 689 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
690 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
691 /* fbm_instr() takes into account exact value of end-of-str
692 if the check is SvTAIL(ed). Since false positives are OK,
693 and end-of-str is not later than strend we are OK. */
694 if (must == &PL_sv_undef) {
695 s = (char*)NULL;
696 DEBUG_r(must = prog->float_utf8); /* for debug message */
697 }
698 else
2c2d71f5 699 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
700 (unsigned char*)last + SvCUR(must)
701 - (SvTAIL(must)!=0),
7fba1cd6 702 must, multiline ? FBMrf_MULTILINE : 0);
33b8afdf
JH
703 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
704 (s ? "Found" : "Contradicts"),
705 PL_colors[0],
706 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
707 SvPVX(must),
708 PL_colors[1], (SvTAIL(must) ? "$" : "")));
709 if (!s) {
710 if (last1 == last) {
2c2d71f5 711 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
712 ", giving up...\n"));
713 goto fail_finish;
2c2d71f5 714 }
33b8afdf
JH
715 DEBUG_r(PerlIO_printf(Perl_debug_log,
716 ", trying anchored starting at offset %ld...\n",
717 (long)(s1 + 1 - i_strpos)));
718 other_last = last;
719 s = HOP3c(t, 1, strend);
720 goto restart;
721 }
722 else {
723 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
724 (long)(s - i_strpos)));
725 other_last = s; /* Fix this later. --Hugo */
726 s = s1;
727 if (t == strpos)
728 goto try_at_start;
729 goto try_at_offset;
730 }
2c2d71f5 731 }
cad2e5aa 732 }
2c2d71f5
JH
733
734 t = s - prog->check_offset_max;
2c2d71f5 735 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 736 && (!do_utf8
1aa99e6b
IH
737 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
738 && t > strpos))) {
2c2d71f5
JH
739 /* Fixed substring is found far enough so that the match
740 cannot start at strpos. */
741 try_at_offset:
cad2e5aa 742 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
743 /* Eventually fbm_*() should handle this, but often
744 anchored_offset is not 0, so this check will not be wasted. */
745 /* XXXX In the code below we prefer to look for "^" even in
746 presence of anchored substrings. And we search even
747 beyond the found float position. These pessimizations
748 are historical artefacts only. */
749 find_anchor:
2c2d71f5 750 while (t < strend - prog->minlen) {
cad2e5aa 751 if (*t == '\n') {
4ee3650e 752 if (t < check_at - prog->check_offset_min) {
33b8afdf 753 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
754 /* Since we moved from the found position,
755 we definitely contradict the found anchored
30944b6d
IZ
756 substr. Due to the above check we do not
757 contradict "check" substr.
758 Thus we can arrive here only if check substr
759 is float. Redo checking for "other"=="fixed".
760 */
9041c2e3 761 strpos = t + 1;
30944b6d
IZ
762 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
763 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
764 goto do_other_anchored;
765 }
4ee3650e
GS
766 /* We don't contradict the found floating substring. */
767 /* XXXX Why not check for STCLASS? */
cad2e5aa 768 s = t + 1;
2c2d71f5 769 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 770 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
771 goto set_useful;
772 }
4ee3650e
GS
773 /* Position contradicts check-string */
774 /* XXXX probably better to look for check-string
775 than for "\n", so one should lower the limit for t? */
776 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 777 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 778 other_last = strpos = s = t + 1;
cad2e5aa
JH
779 goto restart;
780 }
781 t++;
782 }
2c2d71f5
JH
783 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
784 PL_colors[0],PL_colors[1]));
785 goto fail_finish;
cad2e5aa 786 }
f5952150
GS
787 else {
788 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
789 PL_colors[0],PL_colors[1]));
790 }
cad2e5aa
JH
791 s = t;
792 set_useful:
33b8afdf 793 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
794 }
795 else {
f5952150 796 /* The found string does not prohibit matching at strpos,
2c2d71f5 797 - no optimization of calling REx engine can be performed,
f5952150
GS
798 unless it was an MBOL and we are not after MBOL,
799 or a future STCLASS check will fail this. */
2c2d71f5
JH
800 try_at_start:
801 /* Even in this situation we may use MBOL flag if strpos is offset
802 wrt the start of the string. */
05b4157f 803 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 804 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
805 /* May be due to an implicit anchor of m{.*foo} */
806 && !(prog->reganch & ROPT_IMPLICIT))
807 {
cad2e5aa
JH
808 t = strpos;
809 goto find_anchor;
810 }
30944b6d 811 DEBUG_r( if (ml_anch)
f5952150
GS
812 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
813 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 814 );
2c2d71f5 815 success_at_start:
30944b6d 816 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
817 && (do_utf8 ? (
818 prog->check_utf8 /* Could be deleted already */
819 && --BmUSEFUL(prog->check_utf8) < 0
820 && (prog->check_utf8 == prog->float_utf8)
821 ) : (
822 prog->check_substr /* Could be deleted already */
823 && --BmUSEFUL(prog->check_substr) < 0
824 && (prog->check_substr == prog->float_substr)
825 )))
66e933ab 826 {
cad2e5aa 827 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 828 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
829 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
830 if (do_utf8 ? prog->check_substr : prog->check_utf8)
831 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
832 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
833 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 834 check = Nullsv; /* abort */
cad2e5aa 835 s = strpos;
3cf5c195
IZ
836 /* XXXX This is a remnant of the old implementation. It
837 looks wasteful, since now INTUIT can use many
6eb5f6b9 838 other heuristics. */
cad2e5aa
JH
839 prog->reganch &= ~RE_USE_INTUIT;
840 }
841 else
842 s = strpos;
843 }
844
6eb5f6b9
JH
845 /* Last resort... */
846 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
847 if (prog->regstclass) {
848 /* minlen == 0 is possible if regstclass is \b or \B,
849 and the fixed substr is ''$.
850 Since minlen is already taken into account, s+1 is before strend;
851 accidentally, minlen >= 1 guaranties no false positives at s + 1
852 even for \b or \B. But (minlen? 1 : 0) below assumes that
853 regstclass does not come from lookahead... */
854 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
855 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 856 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 857 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 858 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 859 : 1);
33b8afdf 860 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 861 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 862 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
863 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
864 cl_l, strend)
865 : strend);
a1933d95 866 char *startpos = strbeg;
6eb5f6b9
JH
867
868 t = s;
9926ca43 869 cache_re(prog);
f33976b4 870 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
871 if (!s) {
872#ifdef DEBUGGING
b7953727 873 char *what = 0;
6eb5f6b9
JH
874#endif
875 if (endpos == strend) {
876 DEBUG_r( PerlIO_printf(Perl_debug_log,
877 "Could not match STCLASS...\n") );
878 goto fail;
879 }
66e933ab
GS
880 DEBUG_r( PerlIO_printf(Perl_debug_log,
881 "This position contradicts STCLASS...\n") );
653099ff
GS
882 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
883 goto fail;
6eb5f6b9 884 /* Contradict one of substrings */
33b8afdf
JH
885 if (prog->anchored_substr || prog->anchored_utf8) {
886 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
887 DEBUG_r( what = "anchored" );
888 hop_and_restart:
1aa99e6b 889 s = HOP3c(t, 1, strend);
66e933ab
GS
890 if (s + start_shift + end_shift > strend) {
891 /* XXXX Should be taken into account earlier? */
892 DEBUG_r( PerlIO_printf(Perl_debug_log,
893 "Could not match STCLASS...\n") );
894 goto fail;
895 }
5e39e1e5
HS
896 if (!check)
897 goto giveup;
6eb5f6b9 898 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 899 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
900 what, (long)(s + start_shift - i_strpos)) );
901 goto restart;
902 }
66e933ab 903 /* Have both, check_string is floating */
6eb5f6b9
JH
904 if (t + start_shift >= check_at) /* Contradicts floating=check */
905 goto retry_floating_check;
906 /* Recheck anchored substring, but not floating... */
9041c2e3 907 s = check_at;
5e39e1e5
HS
908 if (!check)
909 goto giveup;
6eb5f6b9 910 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 911 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
912 (long)(other_last - i_strpos)) );
913 goto do_other_anchored;
914 }
60e71179
GS
915 /* Another way we could have checked stclass at the
916 current position only: */
917 if (ml_anch) {
918 s = t = t + 1;
5e39e1e5
HS
919 if (!check)
920 goto giveup;
60e71179 921 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
922 "Looking for /%s^%s/m starting at offset %ld...\n",
923 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 924 goto try_at_offset;
66e933ab 925 }
33b8afdf 926 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 927 goto fail;
6eb5f6b9
JH
928 /* Check is floating subtring. */
929 retry_floating_check:
930 t = check_at - start_shift;
931 DEBUG_r( what = "floating" );
932 goto hop_and_restart;
933 }
b7953727
JH
934 if (t != s) {
935 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 936 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
937 (long)(t - i_strpos), (long)(s - i_strpos))
938 );
939 }
940 else {
941 DEBUG_r(PerlIO_printf(Perl_debug_log,
942 "Does not contradict STCLASS...\n");
943 );
944 }
6eb5f6b9 945 }
5e39e1e5
HS
946 giveup:
947 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
948 PL_colors[4], (check ? "Guessed" : "Giving up"),
949 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 950 return s;
2c2d71f5
JH
951
952 fail_finish: /* Substring not found */
33b8afdf
JH
953 if (prog->check_substr || prog->check_utf8) /* could be removed already */
954 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 955 fail:
2c2d71f5 956 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
957 PL_colors[4],PL_colors[5]));
958 return Nullch;
959}
9661b544 960
6eb5f6b9 961/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
962STATIC char *
963S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 964{
6eb5f6b9
JH
965 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
966 char *m;
d8093b23 967 STRLEN ln;
5dab1207 968 STRLEN lnc;
078c425b 969 register STRLEN uskip;
d8093b23
G
970 unsigned int c1;
971 unsigned int c2;
6eb5f6b9
JH
972 char *e;
973 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 974 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 975
6eb5f6b9
JH
976 /* We know what class it must start with. */
977 switch (OP(c)) {
6eb5f6b9 978 case ANYOF:
388cc4de 979 if (do_utf8) {
078c425b 980 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
981 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
982 !UTF8_IS_INVARIANT((U8)s[0]) ?
983 reginclass(c, (U8*)s, 0, do_utf8) :
984 REGINCLASS(c, (U8*)s)) {
985 if (tmp && (norun || regtry(prog, s)))
986 goto got_it;
987 else
988 tmp = doevery;
989 }
990 else
991 tmp = 1;
078c425b 992 s += uskip;
388cc4de
HS
993 }
994 }
995 else {
996 while (s < strend) {
997 STRLEN skip = 1;
998
999 if (REGINCLASS(c, (U8*)s) ||
1000 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1001 /* The assignment of 2 is intentional:
1002 * for the folded sharp s, the skip is 2. */
1003 (skip = SHARP_S_SKIP))) {
1004 if (tmp && (norun || regtry(prog, s)))
1005 goto got_it;
1006 else
1007 tmp = doevery;
1008 }
1009 else
1010 tmp = 1;
1011 s += skip;
1012 }
a0d0e21e 1013 }
6eb5f6b9 1014 break;
f33976b4
DB
1015 case CANY:
1016 while (s < strend) {
1017 if (tmp && (norun || regtry(prog, s)))
1018 goto got_it;
1019 else
1020 tmp = doevery;
1021 s++;
1022 }
1023 break;
6eb5f6b9 1024 case EXACTF:
5dab1207
NIS
1025 m = STRING(c);
1026 ln = STR_LEN(c); /* length to match in octets/bytes */
1027 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1028 if (UTF) {
a2a2844f 1029 STRLEN ulen1, ulen2;
5dab1207 1030 U8 *sm = (U8 *) m;
e7ae6809
JH
1031 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1032 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
1033
1034 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1035 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1036
872c91ae
JH
1037 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1038 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1039 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1040 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
5dab1207
NIS
1041 lnc = 0;
1042 while (sm < ((U8 *) m + ln)) {
1043 lnc++;
1044 sm += UTF8SKIP(sm);
1045 }
1aa99e6b
IH
1046 }
1047 else {
1048 c1 = *(U8*)m;
1049 c2 = PL_fold[c1];
1050 }
6eb5f6b9
JH
1051 goto do_exactf;
1052 case EXACTFL:
5dab1207
NIS
1053 m = STRING(c);
1054 ln = STR_LEN(c);
1055 lnc = (I32) ln;
d8093b23 1056 c1 = *(U8*)m;
6eb5f6b9
JH
1057 c2 = PL_fold_locale[c1];
1058 do_exactf:
db12adc6 1059 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1060
6eb5f6b9
JH
1061 if (norun && e < s)
1062 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1063
60a8b682
JH
1064 /* The idea in the EXACTF* cases is to first find the
1065 * first character of the EXACTF* node and then, if
1066 * necessary, case-insensitively compare the full
1067 * text of the node. The c1 and c2 are the first
1068 * characters (though in Unicode it gets a bit
1069 * more complicated because there are more cases
7f16dd3d
JH
1070 * than just upper and lower: one needs to use
1071 * the so-called folding case for case-insensitive
1072 * matching (called "loose matching" in Unicode).
1073 * ibcmp_utf8() will do just that. */
60a8b682 1074
1aa99e6b 1075 if (do_utf8) {
575cac57
JH
1076 UV c, f;
1077 U8 tmpbuf [UTF8_MAXLEN+1];
1078 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1079 STRLEN len, foldlen;
d7f013c8 1080
09091399 1081 if (c1 == c2) {
5dab1207
NIS
1082 /* Upper and lower of 1st char are equal -
1083 * probably not a "letter". */
1aa99e6b 1084 while (s <= e) {
872c91ae
JH
1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1086 ckWARN(WARN_UTF8) ?
1087 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1088 if ( c == c1
1089 && (ln == len ||
66423254 1090 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1091 m, (char **)0, ln, (bool)UTF))
55da9344 1092 && (norun || regtry(prog, s)) )
1aa99e6b 1093 goto got_it;
80aecb99
JH
1094 else {
1095 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1096 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1097 if ( f != c
1098 && (f == c1 || f == c2)
1099 && (ln == foldlen ||
66423254
JH
1100 !ibcmp_utf8((char *) foldbuf,
1101 (char **)0, foldlen, do_utf8,
d07ddd77 1102 m,
eb160463 1103 (char **)0, ln, (bool)UTF))
80aecb99
JH
1104 && (norun || regtry(prog, s)) )
1105 goto got_it;
1106 }
1aa99e6b
IH
1107 s += len;
1108 }
09091399
JH
1109 }
1110 else {
1aa99e6b 1111 while (s <= e) {
872c91ae
JH
1112 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1113 ckWARN(WARN_UTF8) ?
1114 0 : UTF8_ALLOW_ANY);
80aecb99 1115
60a8b682 1116 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1117 * Note that not all the possible combinations
1118 * are handled here: some of them are handled
1119 * by the standard folding rules, and some of
1120 * them (the character class or ANYOF cases)
1121 * are handled during compiletime in
1122 * regexec.c:S_regclass(). */
880bd946
JH
1123 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1124 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1125 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1126
1127 if ( (c == c1 || c == c2)
1128 && (ln == len ||
66423254 1129 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1130 m, (char **)0, ln, (bool)UTF))
55da9344 1131 && (norun || regtry(prog, s)) )
1aa99e6b 1132 goto got_it;
80aecb99
JH
1133 else {
1134 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1135 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1136 if ( f != c
1137 && (f == c1 || f == c2)
1138 && (ln == foldlen ||
a6872d42 1139 !ibcmp_utf8((char *) foldbuf,
66423254 1140 (char **)0, foldlen, do_utf8,
d07ddd77 1141 m,
eb160463 1142 (char **)0, ln, (bool)UTF))
80aecb99
JH
1143 && (norun || regtry(prog, s)) )
1144 goto got_it;
1145 }
1aa99e6b
IH
1146 s += len;
1147 }
09091399 1148 }
1aa99e6b
IH
1149 }
1150 else {
1151 if (c1 == c2)
1152 while (s <= e) {
1153 if ( *(U8*)s == c1
1154 && (ln == 1 || !(OP(c) == EXACTF
1155 ? ibcmp(s, m, ln)
1156 : ibcmp_locale(s, m, ln)))
1157 && (norun || regtry(prog, s)) )
1158 goto got_it;
1159 s++;
1160 }
1161 else
1162 while (s <= e) {
1163 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1164 && (ln == 1 || !(OP(c) == EXACTF
1165 ? ibcmp(s, m, ln)
1166 : ibcmp_locale(s, m, ln)))
1167 && (norun || regtry(prog, s)) )
1168 goto got_it;
1169 s++;
1170 }
b3c9acc1
IZ
1171 }
1172 break;
bbce6d69 1173 case BOUNDL:
3280af22 1174 PL_reg_flags |= RF_tainted;
bbce6d69 1175 /* FALL THROUGH */
a0d0e21e 1176 case BOUND:
ffc61ed2 1177 if (do_utf8) {
12d33761 1178 if (s == PL_bostr)
ffc61ed2
JH
1179 tmp = '\n';
1180 else {
b4f7163a 1181 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1182
b4f7163a 1183 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1184 }
1185 tmp = ((OP(c) == BOUND ?
9041c2e3 1186 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1187 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1188 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1189 if (tmp == !(OP(c) == BOUND ?
3568d838 1190 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1191 isALNUM_LC_utf8((U8*)s)))
1192 {
1193 tmp = !tmp;
1194 if ((norun || regtry(prog, s)))
1195 goto got_it;
1196 }
078c425b 1197 s += uskip;
a687059c 1198 }
a0d0e21e 1199 }
667bb95a 1200 else {
12d33761 1201 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1202 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1203 while (s < strend) {
1204 if (tmp ==
1205 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1206 tmp = !tmp;
1207 if ((norun || regtry(prog, s)))
1208 goto got_it;
1209 }
1210 s++;
a0ed51b3 1211 }
a0ed51b3 1212 }
6eb5f6b9 1213 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1214 goto got_it;
1215 break;
bbce6d69 1216 case NBOUNDL:
3280af22 1217 PL_reg_flags |= RF_tainted;
bbce6d69 1218 /* FALL THROUGH */
a0d0e21e 1219 case NBOUND:
ffc61ed2 1220 if (do_utf8) {
12d33761 1221 if (s == PL_bostr)
ffc61ed2
JH
1222 tmp = '\n';
1223 else {
b4f7163a 1224 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1225
b4f7163a 1226 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1227 }
1228 tmp = ((OP(c) == NBOUND ?
9041c2e3 1229 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1230 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1231 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1232 if (tmp == !(OP(c) == NBOUND ?
3568d838 1233 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1234 isALNUM_LC_utf8((U8*)s)))
1235 tmp = !tmp;
1236 else if ((norun || regtry(prog, s)))
1237 goto got_it;
078c425b 1238 s += uskip;
ffc61ed2 1239 }
a0d0e21e 1240 }
667bb95a 1241 else {
12d33761 1242 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1243 tmp = ((OP(c) == NBOUND ?
1244 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1245 while (s < strend) {
1246 if (tmp ==
1247 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1248 tmp = !tmp;
1249 else if ((norun || regtry(prog, s)))
1250 goto got_it;
1251 s++;
1252 }
a0ed51b3 1253 }
6eb5f6b9 1254 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1255 goto got_it;
1256 break;
a0d0e21e 1257 case ALNUM:
ffc61ed2 1258 if (do_utf8) {
8269fa76 1259 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1260 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1261 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1262 if (tmp && (norun || regtry(prog, s)))
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
bbce6d69 1267 else
ffc61ed2 1268 tmp = 1;
078c425b 1269 s += uskip;
bbce6d69 1270 }
bbce6d69 1271 }
ffc61ed2
JH
1272 else {
1273 while (s < strend) {
1274 if (isALNUM(*s)) {
1275 if (tmp && (norun || regtry(prog, s)))
1276 goto got_it;
1277 else
1278 tmp = doevery;
1279 }
a0ed51b3 1280 else
ffc61ed2
JH
1281 tmp = 1;
1282 s++;
a0ed51b3 1283 }
a0ed51b3
LW
1284 }
1285 break;
bbce6d69 1286 case ALNUML:
3280af22 1287 PL_reg_flags |= RF_tainted;
ffc61ed2 1288 if (do_utf8) {
078c425b 1289 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1290 if (isALNUM_LC_utf8((U8*)s)) {
1291 if (tmp && (norun || regtry(prog, s)))
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
a687059c 1296 else
ffc61ed2 1297 tmp = 1;
078c425b 1298 s += uskip;
a0d0e21e 1299 }
a0d0e21e 1300 }
ffc61ed2
JH
1301 else {
1302 while (s < strend) {
1303 if (isALNUM_LC(*s)) {
1304 if (tmp && (norun || regtry(prog, s)))
1305 goto got_it;
1306 else
1307 tmp = doevery;
1308 }
a0ed51b3 1309 else
ffc61ed2
JH
1310 tmp = 1;
1311 s++;
a0ed51b3 1312 }
a0ed51b3
LW
1313 }
1314 break;
a0d0e21e 1315 case NALNUM:
ffc61ed2 1316 if (do_utf8) {
8269fa76 1317 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1318 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1319 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1320 if (tmp && (norun || regtry(prog, s)))
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
bbce6d69 1325 else
ffc61ed2 1326 tmp = 1;
078c425b 1327 s += uskip;
bbce6d69 1328 }
bbce6d69 1329 }
ffc61ed2
JH
1330 else {
1331 while (s < strend) {
1332 if (!isALNUM(*s)) {
1333 if (tmp && (norun || regtry(prog, s)))
1334 goto got_it;
1335 else
1336 tmp = doevery;
1337 }
a0ed51b3 1338 else
ffc61ed2
JH
1339 tmp = 1;
1340 s++;
a0ed51b3 1341 }
a0ed51b3
LW
1342 }
1343 break;
bbce6d69 1344 case NALNUML:
3280af22 1345 PL_reg_flags |= RF_tainted;
ffc61ed2 1346 if (do_utf8) {
078c425b 1347 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1348 if (!isALNUM_LC_utf8((U8*)s)) {
1349 if (tmp && (norun || regtry(prog, s)))
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a687059c 1354 else
ffc61ed2 1355 tmp = 1;
078c425b 1356 s += uskip;
a687059c 1357 }
a0d0e21e 1358 }
ffc61ed2
JH
1359 else {
1360 while (s < strend) {
1361 if (!isALNUM_LC(*s)) {
1362 if (tmp && (norun || regtry(prog, s)))
1363 goto got_it;
1364 else
1365 tmp = doevery;
1366 }
a0ed51b3 1367 else
ffc61ed2
JH
1368 tmp = 1;
1369 s++;
a0ed51b3 1370 }
a0ed51b3
LW
1371 }
1372 break;
a0d0e21e 1373 case SPACE:
ffc61ed2 1374 if (do_utf8) {
8269fa76 1375 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1376 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1377 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1378 if (tmp && (norun || regtry(prog, s)))
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
a0d0e21e 1383 else
ffc61ed2 1384 tmp = 1;
078c425b 1385 s += uskip;
2304df62 1386 }
a0d0e21e 1387 }
ffc61ed2
JH
1388 else {
1389 while (s < strend) {
1390 if (isSPACE(*s)) {
1391 if (tmp && (norun || regtry(prog, s)))
1392 goto got_it;
1393 else
1394 tmp = doevery;
1395 }
a0ed51b3 1396 else
ffc61ed2
JH
1397 tmp = 1;
1398 s++;
a0ed51b3 1399 }
a0ed51b3
LW
1400 }
1401 break;
bbce6d69 1402 case SPACEL:
3280af22 1403 PL_reg_flags |= RF_tainted;
ffc61ed2 1404 if (do_utf8) {
078c425b 1405 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1406 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1407 if (tmp && (norun || regtry(prog, s)))
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
bbce6d69 1412 else
ffc61ed2 1413 tmp = 1;
078c425b 1414 s += uskip;
bbce6d69 1415 }
bbce6d69 1416 }
ffc61ed2
JH
1417 else {
1418 while (s < strend) {
1419 if (isSPACE_LC(*s)) {
1420 if (tmp && (norun || regtry(prog, s)))
1421 goto got_it;
1422 else
1423 tmp = doevery;
1424 }
a0ed51b3 1425 else
ffc61ed2
JH
1426 tmp = 1;
1427 s++;
a0ed51b3 1428 }
a0ed51b3
LW
1429 }
1430 break;
a0d0e21e 1431 case NSPACE:
ffc61ed2 1432 if (do_utf8) {
8269fa76 1433 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1434 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1435 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1436 if (tmp && (norun || regtry(prog, s)))
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
a0d0e21e 1441 else
ffc61ed2 1442 tmp = 1;
078c425b 1443 s += uskip;
a687059c 1444 }
a0d0e21e 1445 }
ffc61ed2
JH
1446 else {
1447 while (s < strend) {
1448 if (!isSPACE(*s)) {
1449 if (tmp && (norun || regtry(prog, s)))
1450 goto got_it;
1451 else
1452 tmp = doevery;
1453 }
a0ed51b3 1454 else
ffc61ed2
JH
1455 tmp = 1;
1456 s++;
a0ed51b3 1457 }
a0ed51b3
LW
1458 }
1459 break;
bbce6d69 1460 case NSPACEL:
3280af22 1461 PL_reg_flags |= RF_tainted;
ffc61ed2 1462 if (do_utf8) {
078c425b 1463 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1464 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1465 if (tmp && (norun || regtry(prog, s)))
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
bbce6d69 1470 else
ffc61ed2 1471 tmp = 1;
078c425b 1472 s += uskip;
bbce6d69 1473 }
bbce6d69 1474 }
ffc61ed2
JH
1475 else {
1476 while (s < strend) {
1477 if (!isSPACE_LC(*s)) {
1478 if (tmp && (norun || regtry(prog, s)))
1479 goto got_it;
1480 else
1481 tmp = doevery;
1482 }
a0ed51b3 1483 else
ffc61ed2
JH
1484 tmp = 1;
1485 s++;
a0ed51b3 1486 }
a0ed51b3
LW
1487 }
1488 break;
a0d0e21e 1489 case DIGIT:
ffc61ed2 1490 if (do_utf8) {
8269fa76 1491 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1492 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1493 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1494 if (tmp && (norun || regtry(prog, s)))
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
a0d0e21e 1499 else
ffc61ed2 1500 tmp = 1;
078c425b 1501 s += uskip;
2b69d0c2 1502 }
a0d0e21e 1503 }
ffc61ed2
JH
1504 else {
1505 while (s < strend) {
1506 if (isDIGIT(*s)) {
1507 if (tmp && (norun || regtry(prog, s)))
1508 goto got_it;
1509 else
1510 tmp = doevery;
1511 }
a0ed51b3 1512 else
ffc61ed2
JH
1513 tmp = 1;
1514 s++;
a0ed51b3 1515 }
a0ed51b3
LW
1516 }
1517 break;
b8c5462f
JH
1518 case DIGITL:
1519 PL_reg_flags |= RF_tainted;
ffc61ed2 1520 if (do_utf8) {
078c425b 1521 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1522 if (isDIGIT_LC_utf8((U8*)s)) {
1523 if (tmp && (norun || regtry(prog, s)))
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
b8c5462f 1528 else
ffc61ed2 1529 tmp = 1;
078c425b 1530 s += uskip;
b8c5462f 1531 }
b8c5462f 1532 }
ffc61ed2
JH
1533 else {
1534 while (s < strend) {
1535 if (isDIGIT_LC(*s)) {
1536 if (tmp && (norun || regtry(prog, s)))
1537 goto got_it;
1538 else
1539 tmp = doevery;
1540 }
b8c5462f 1541 else
ffc61ed2
JH
1542 tmp = 1;
1543 s++;
b8c5462f 1544 }
b8c5462f
JH
1545 }
1546 break;
a0d0e21e 1547 case NDIGIT:
ffc61ed2 1548 if (do_utf8) {
8269fa76 1549 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1550 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1551 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1552 if (tmp && (norun || regtry(prog, s)))
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
a0d0e21e 1557 else
ffc61ed2 1558 tmp = 1;
078c425b 1559 s += uskip;
a687059c 1560 }
a0d0e21e 1561 }
ffc61ed2
JH
1562 else {
1563 while (s < strend) {
1564 if (!isDIGIT(*s)) {
1565 if (tmp && (norun || regtry(prog, s)))
1566 goto got_it;
1567 else
1568 tmp = doevery;
1569 }
a0ed51b3 1570 else
ffc61ed2
JH
1571 tmp = 1;
1572 s++;
a0ed51b3 1573 }
a0ed51b3
LW
1574 }
1575 break;
b8c5462f
JH
1576 case NDIGITL:
1577 PL_reg_flags |= RF_tainted;
ffc61ed2 1578 if (do_utf8) {
078c425b 1579 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1580 if (!isDIGIT_LC_utf8((U8*)s)) {
1581 if (tmp && (norun || regtry(prog, s)))
1582 goto got_it;
1583 else
1584 tmp = doevery;
1585 }
b8c5462f 1586 else
ffc61ed2 1587 tmp = 1;
078c425b 1588 s += uskip;
b8c5462f 1589 }
a0ed51b3 1590 }
ffc61ed2
JH
1591 else {
1592 while (s < strend) {
1593 if (!isDIGIT_LC(*s)) {
1594 if (tmp && (norun || regtry(prog, s)))
1595 goto got_it;
1596 else
1597 tmp = doevery;
1598 }
cf93c79d 1599 else
ffc61ed2
JH
1600 tmp = 1;
1601 s++;
b8c5462f 1602 }
b8c5462f
JH
1603 }
1604 break;
b3c9acc1 1605 default:
3c3eec57
GS
1606 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1607 break;
d6a28714 1608 }
6eb5f6b9
JH
1609 return 0;
1610 got_it:
1611 return s;
1612}
1613
1614/*
1615 - regexec_flags - match a regexp against a string
1616 */
1617I32
1618Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1619 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1620/* strend: pointer to null at end of string */
1621/* strbeg: real beginning of string */
1622/* minend: end of match must be >=minend after stringarg. */
1623/* data: May be used for some additional optimizations. */
1624/* nosave: For optimizations. */
1625{
6eb5f6b9
JH
1626 register char *s;
1627 register regnode *c;
1628 register char *startpos = stringarg;
6eb5f6b9
JH
1629 I32 minlen; /* must match at least this many chars */
1630 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1631 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1632 constant substr. */ /* CC */
1633 I32 end_shift = 0; /* Same for the end. */ /* CC */
1634 I32 scream_pos = -1; /* Internal iterator of scream. */
1635 char *scream_olds;
1636 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1637 bool do_utf8 = DO_UTF8(sv);
7fba1cd6 1638 I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1639#ifdef DEBUGGING
9e55ce06
JH
1640 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1641 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1642#endif
a30b2f1f 1643 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1644
1645 PL_regcc = 0;
1646
1647 cache_re(prog);
1648#ifdef DEBUGGING
aea4f609 1649 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1650#endif
1651
1652 /* Be paranoid... */
1653 if (prog == NULL || startpos == NULL) {
1654 Perl_croak(aTHX_ "NULL regexp parameter");
1655 return 0;
1656 }
1657
1658 minlen = prog->minlen;
61a36c01 1659 if (strend - startpos < minlen) {
a72c7584
JH
1660 DEBUG_r(PerlIO_printf(Perl_debug_log,
1661 "String too short [regexec_flags]...\n"));
1662 goto phooey;
1aa99e6b 1663 }
6eb5f6b9 1664
6eb5f6b9
JH
1665 /* Check validity of program. */
1666 if (UCHARAT(prog->program) != REG_MAGIC) {
1667 Perl_croak(aTHX_ "corrupted regexp program");
1668 }
1669
1670 PL_reg_flags = 0;
1671 PL_reg_eval_set = 0;
1672 PL_reg_maxiter = 0;
1673
1674 if (prog->reganch & ROPT_UTF8)
1675 PL_reg_flags |= RF_utf8;
1676
1677 /* Mark beginning of line for ^ and lookbehind. */
1678 PL_regbol = startpos;
1679 PL_bostr = strbeg;
1680 PL_reg_sv = sv;
1681
1682 /* Mark end of line for $ (and such) */
1683 PL_regeol = strend;
1684
1685 /* see how far we have to get to not match where we matched before */
1686 PL_regtill = startpos+minend;
1687
1688 /* We start without call_cc context. */
1689 PL_reg_call_cc = 0;
1690
1691 /* If there is a "must appear" string, look for it. */
1692 s = startpos;
1693
1694 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1695 MAGIC *mg;
1696
1697 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1698 PL_reg_ganch = startpos;
1699 else if (sv && SvTYPE(sv) >= SVt_PVMG
1700 && SvMAGIC(sv)
14befaf4
DM
1701 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1702 && mg->mg_len >= 0) {
6eb5f6b9
JH
1703 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1704 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1705 if (s > PL_reg_ganch)
6eb5f6b9
JH
1706 goto phooey;
1707 s = PL_reg_ganch;
1708 }
1709 }
1710 else /* pos() not defined */
1711 PL_reg_ganch = strbeg;
1712 }
1713
33b8afdf 1714 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1715 re_scream_pos_data d;
1716
1717 d.scream_olds = &scream_olds;
1718 d.scream_pos = &scream_pos;
1719 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1720 if (!s) {
1721 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1722 goto phooey; /* not present */
3fa9c3d7 1723 }
6eb5f6b9
JH
1724 }
1725
2a782b5b 1726 DEBUG_r({
9e55ce06
JH
1727 char *s0 = UTF ?
1728 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1729 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1730 prog->precomp;
1731 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1732 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1733 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1734 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1735 if (!PL_colorset)
1736 reginitcolors();
1737 PerlIO_printf(Perl_debug_log,
9e55ce06 1738 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1739 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1740 len0, len0, s0,
2a782b5b 1741 PL_colors[1],
9e55ce06 1742 len0 > 60 ? "..." : "",
2a782b5b 1743 PL_colors[0],
9e55ce06
JH
1744 (int)(len1 > 60 ? 60 : len1),
1745 s1, PL_colors[1],
1746 (len1 > 60 ? "..." : "")
2a782b5b
JH
1747 );
1748 });
6eb5f6b9
JH
1749
1750 /* Simplest case: anchored match need be tried only once. */
1751 /* [unless only anchor is BOL and multiline is set] */
1752 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1753 if (s == startpos && regtry(prog, startpos))
1754 goto got_it;
7fba1cd6 1755 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1756 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1757 {
1758 char *end;
1759
1760 if (minlen)
1761 dontbother = minlen - 1;
1aa99e6b 1762 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1763 /* for multiline we only have to try after newlines */
33b8afdf 1764 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1765 if (s == startpos)
1766 goto after_try;
1767 while (1) {
1768 if (regtry(prog, s))
1769 goto got_it;
1770 after_try:
1771 if (s >= end)
1772 goto phooey;
1773 if (prog->reganch & RE_USE_INTUIT) {
1774 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1775 if (!s)
1776 goto phooey;
1777 }
1778 else
1779 s++;
1780 }
1781 } else {
1782 if (s > startpos)
1783 s--;
1784 while (s < end) {
1785 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1786 if (regtry(prog, s))
1787 goto got_it;
1788 }
1789 }
1790 }
1791 }
1792 goto phooey;
1793 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1794 if (regtry(prog, PL_reg_ganch))
1795 goto got_it;
1796 goto phooey;
1797 }
1798
1799 /* Messy cases: unanchored match. */
33b8afdf 1800 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1801 /* we have /x+whatever/ */
1802 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1803 char ch;
bf93d4cc
GS
1804#ifdef DEBUGGING
1805 int did_match = 0;
1806#endif
33b8afdf
JH
1807 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1808 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1809 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1810
1aa99e6b 1811 if (do_utf8) {
6eb5f6b9
JH
1812 while (s < strend) {
1813 if (*s == ch) {
bf93d4cc 1814 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1815 if (regtry(prog, s)) goto got_it;
1816 s += UTF8SKIP(s);
1817 while (s < strend && *s == ch)
1818 s += UTF8SKIP(s);
1819 }
1820 s += UTF8SKIP(s);
1821 }
1822 }
1823 else {
1824 while (s < strend) {
1825 if (*s == ch) {
bf93d4cc 1826 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1827 if (regtry(prog, s)) goto got_it;
1828 s++;
1829 while (s < strend && *s == ch)
1830 s++;
1831 }
1832 s++;
1833 }
1834 }
b7953727 1835 DEBUG_r(if (!did_match)
bf93d4cc 1836 PerlIO_printf(Perl_debug_log,
b7953727
JH
1837 "Did not find anchored character...\n")
1838 );
6eb5f6b9
JH
1839 }
1840 /*SUPPRESS 560*/
33b8afdf
JH
1841 else if (prog->anchored_substr != Nullsv
1842 || prog->anchored_utf8 != Nullsv
1843 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1844 && prog->float_max_offset < strend - s)) {
1845 SV *must;
1846 I32 back_max;
1847 I32 back_min;
1848 char *last;
6eb5f6b9 1849 char *last1; /* Last position checked before */
bf93d4cc
GS
1850#ifdef DEBUGGING
1851 int did_match = 0;
1852#endif
33b8afdf
JH
1853 if (prog->anchored_substr || prog->anchored_utf8) {
1854 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1855 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1856 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1857 back_max = back_min = prog->anchored_offset;
1858 } else {
1859 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1860 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1861 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1862 back_max = prog->float_max_offset;
1863 back_min = prog->float_min_offset;
1864 }
1865 if (must == &PL_sv_undef)
1866 /* could not downgrade utf8 check substring, so must fail */
1867 goto phooey;
1868
1869 last = HOP3c(strend, /* Cannot start after this */
1870 -(I32)(CHR_SVLEN(must)
1871 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1872
1873 if (s > PL_bostr)
1874 last1 = HOPc(s, -1);
1875 else
1876 last1 = s - 1; /* bogus */
1877
1878 /* XXXX check_substr already used to find `s', can optimize if
1879 check_substr==must. */
1880 scream_pos = -1;
1881 dontbother = end_shift;
1882 strend = HOPc(strend, -dontbother);
1883 while ( (s <= last) &&
9041c2e3 1884 ((flags & REXEC_SCREAM)
1aa99e6b 1885 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1886 end_shift, &scream_pos, 0))
1aa99e6b 1887 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1888 (unsigned char*)strend, must,
7fba1cd6 1889 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1890 /* we may be pointing at the wrong string */
1891 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
7ef91622 1892 s = strbeg + (s - SvPVX(sv));
bf93d4cc 1893 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1894 if (HOPc(s, -back_max) > last1) {
1895 last1 = HOPc(s, -back_min);
1896 s = HOPc(s, -back_max);
1897 }
1898 else {
1899 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1900
1901 last1 = HOPc(s, -back_min);
1902 s = t;
1903 }
1aa99e6b 1904 if (do_utf8) {
6eb5f6b9
JH
1905 while (s <= last1) {
1906 if (regtry(prog, s))
1907 goto got_it;
1908 s += UTF8SKIP(s);
1909 }
1910 }
1911 else {
1912 while (s <= last1) {
1913 if (regtry(prog, s))
1914 goto got_it;
1915 s++;
1916 }
1917 }
1918 }
b7953727
JH
1919 DEBUG_r(if (!did_match)
1920 PerlIO_printf(Perl_debug_log,
1921 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1922 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1923 ? "anchored" : "floating"),
1924 PL_colors[0],
1925 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1926 SvPVX(must),
b7953727
JH
1927 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1928 );
6eb5f6b9
JH
1929 goto phooey;
1930 }
155aba94 1931 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1932 if (minlen) {
1933 I32 op = (U8)OP(prog->regstclass);
66e933ab 1934 /* don't bother with what can't match */
f14c76ed
RGS
1935 if (PL_regkind[op] != EXACT && op != CANY)
1936 strend = HOPc(strend, -(minlen - 1));
1937 }
ffc61ed2
JH
1938 DEBUG_r({
1939 SV *prop = sv_newmortal();
9e55ce06
JH
1940 char *s0;
1941 char *s1;
1942 int len0;
1943 int len1;
1944
ffc61ed2 1945 regprop(prop, c);
9e55ce06
JH
1946 s0 = UTF ?
1947 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1948 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1949 SvPVX(prop);
1950 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1951 s1 = UTF ?
c728cb41 1952 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1953 len1 = UTF ? SvCUR(dsv1) : strend - s;
1954 PerlIO_printf(Perl_debug_log,
1955 "Matching stclass `%*.*s' against `%*.*s'\n",
1956 len0, len0, s0,
1957 len1, len1, s1);
ffc61ed2 1958 });
6eb5f6b9
JH
1959 if (find_byclass(prog, c, s, strend, startpos, 0))
1960 goto got_it;
bf93d4cc 1961 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1962 }
1963 else {
1964 dontbother = 0;
33b8afdf
JH
1965 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1966 /* Trim the end. */
d6a28714 1967 char *last;
33b8afdf
JH
1968 SV* float_real;
1969
1970 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1971 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1972 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1973
1974 if (flags & REXEC_SCREAM) {
33b8afdf 1975 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1976 end_shift, &scream_pos, 1); /* last one */
1977 if (!last)
ffc61ed2 1978 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1979 /* we may be pointing at the wrong string */
1980 else if (RX_MATCH_COPIED(prog))
7ef91622 1981 s = strbeg + (s - SvPVX(sv));
b8c5462f 1982 }
d6a28714
JH
1983 else {
1984 STRLEN len;
33b8afdf 1985 char *little = SvPV(float_real, len);
d6a28714 1986
33b8afdf 1987 if (SvTAIL(float_real)) {
d6a28714
JH
1988 if (memEQ(strend - len + 1, little, len - 1))
1989 last = strend - len + 1;
7fba1cd6 1990 else if (!multiline)
9041c2e3 1991 last = memEQ(strend - len, little, len)
d6a28714 1992 ? strend - len : Nullch;
b8c5462f 1993 else
d6a28714
JH
1994 goto find_last;
1995 } else {
1996 find_last:
9041c2e3 1997 if (len)
d6a28714 1998 last = rninstr(s, strend, little, little + len);
b8c5462f 1999 else
d6a28714 2000 last = strend; /* matching `$' */
b8c5462f 2001 }
b8c5462f 2002 }
bf93d4cc
GS
2003 if (last == NULL) {
2004 DEBUG_r(PerlIO_printf(Perl_debug_log,
2005 "%sCan't trim the tail, match fails (should not happen)%s\n",
2006 PL_colors[4],PL_colors[5]));
2007 goto phooey; /* Should not happen! */
2008 }
d6a28714
JH
2009 dontbother = strend - last + prog->float_min_offset;
2010 }
2011 if (minlen && (dontbother < minlen))
2012 dontbother = minlen - 1;
2013 strend -= dontbother; /* this one's always in bytes! */
2014 /* We don't know much -- general case. */
1aa99e6b 2015 if (do_utf8) {
d6a28714
JH
2016 for (;;) {
2017 if (regtry(prog, s))
2018 goto got_it;
2019 if (s >= strend)
2020 break;
b8c5462f 2021 s += UTF8SKIP(s);
d6a28714
JH
2022 };
2023 }
2024 else {
2025 do {
2026 if (regtry(prog, s))
2027 goto got_it;
2028 } while (s++ < strend);
2029 }
2030 }
2031
2032 /* Failure. */
2033 goto phooey;
2034
2035got_it:
2036 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2037
2038 if (PL_reg_eval_set) {
2039 /* Preserve the current value of $^R */
2040 if (oreplsv != GvSV(PL_replgv))
2041 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2042 restored, the value remains
2043 the same. */
acfe0abc 2044 restore_pos(aTHX_ 0);
d6a28714
JH
2045 }
2046
2047 /* make sure $`, $&, $', and $digit will work later */
2048 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2049 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2050 if (flags & REXEC_COPY_STR) {
2051 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2052#ifdef PERL_COPY_ON_WRITE
2053 if ((SvIsCOW(sv)
2054 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2055 if (DEBUG_C_TEST) {
2056 PerlIO_printf(Perl_debug_log,
2057 "Copy on write: regexp capture, type %d\n",
2058 (int) SvTYPE(sv));
2059 }
2060 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2061 prog->subbeg = SvPVX(prog->saved_copy);
2062 assert (SvPOKp(prog->saved_copy));
2063 } else
2064#endif
2065 {
2066 RX_MATCH_COPIED_on(prog);
2067 s = savepvn(strbeg, i);
2068 prog->subbeg = s;
2069 }
d6a28714 2070 prog->sublen = i;
d6a28714
JH
2071 }
2072 else {
2073 prog->subbeg = strbeg;
2074 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2075 }
2076 }
9041c2e3 2077
d6a28714
JH
2078 return 1;
2079
2080phooey:
bf93d4cc
GS
2081 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2082 PL_colors[4],PL_colors[5]));
d6a28714 2083 if (PL_reg_eval_set)
acfe0abc 2084 restore_pos(aTHX_ 0);
d6a28714
JH
2085 return 0;
2086}
2087
2088/*
2089 - regtry - try match at specific point
2090 */
2091STATIC I32 /* 0 failure, 1 success */
2092S_regtry(pTHX_ regexp *prog, char *startpos)
2093{
d6a28714
JH
2094 register I32 i;
2095 register I32 *sp;
2096 register I32 *ep;
2097 CHECKPOINT lastcp;
2098
02db2b7b
IZ
2099#ifdef DEBUGGING
2100 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2101#endif
d6a28714
JH
2102 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2103 MAGIC *mg;
2104
2105 PL_reg_eval_set = RS_init;
2106 DEBUG_r(DEBUG_s(
b900a521
JH
2107 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2108 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2109 ));
e8347627 2110 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2111 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2112 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2113 SAVETMPS;
2114 /* Apparently this is not needed, judging by wantarray. */
e8347627 2115 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2116 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2117
2118 if (PL_reg_sv) {
2119 /* Make $_ available to executed code. */
2120 if (PL_reg_sv != DEFSV) {
59f00321 2121 SAVE_DEFSV;
d6a28714 2122 DEFSV = PL_reg_sv;
b8c5462f 2123 }
d6a28714 2124
9041c2e3 2125 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2126 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2127 /* prepare for quick setting of pos */
14befaf4
DM
2128 sv_magic(PL_reg_sv, (SV*)0,
2129 PERL_MAGIC_regex_global, Nullch, 0);
2130 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2131 mg->mg_len = -1;
b8c5462f 2132 }
d6a28714
JH
2133 PL_reg_magic = mg;
2134 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2135 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2136 }
09687e5a 2137 if (!PL_reg_curpm) {
0f79a09d 2138 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2139#ifdef USE_ITHREADS
2140 {
2141 SV* repointer = newSViv(0);
577e12cc 2142 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2143 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2144 av_push(PL_regex_padav,repointer);
2145 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2146 PL_regex_pad = AvARRAY(PL_regex_padav);
2147 }
2148#endif
2149 }
aaa362c4 2150 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2151 PL_reg_oldcurpm = PL_curpm;
2152 PL_curpm = PL_reg_curpm;
2153 if (RX_MATCH_COPIED(prog)) {
2154 /* Here is a serious problem: we cannot rewrite subbeg,
2155 since it may be needed if this match fails. Thus
2156 $` inside (?{}) could fail... */
2157 PL_reg_oldsaved = prog->subbeg;
2158 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2159#ifdef PERL_COPY_ON_WRITE
2160 PL_nrs = prog->saved_copy;
2161#endif
d6a28714
JH
2162 RX_MATCH_COPIED_off(prog);
2163 }
2164 else
2165 PL_reg_oldsaved = Nullch;
2166 prog->subbeg = PL_bostr;
2167 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2168 }
2169 prog->startp[0] = startpos - PL_bostr;
2170 PL_reginput = startpos;
2171 PL_regstartp = prog->startp;
2172 PL_regendp = prog->endp;
2173 PL_reglastparen = &prog->lastparen;
a01268b5 2174 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2175 prog->lastparen = 0;
03994de8 2176 prog->lastcloseparen = 0;
d6a28714
JH
2177 PL_regsize = 0;
2178 DEBUG_r(PL_reg_starttry = startpos);
2179 if (PL_reg_start_tmpl <= prog->nparens) {
2180 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2181 if(PL_reg_start_tmp)
2182 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2183 else
2184 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2185 }
2186
2187 /* XXXX What this code is doing here?!!! There should be no need
2188 to do this again and again, PL_reglastparen should take care of
3dd2943c 2189 this! --ilya*/
dafc8851
JH
2190
2191 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2192 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2193 * PL_reglastparen), is not needed at all by the test suite
2194 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2195 * enough, for building DynaLoader, or otherwise this
2196 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2197 * will happen. Meanwhile, this code *is* needed for the
2198 * above-mentioned test suite tests to succeed. The common theme
2199 * on those tests seems to be returning null fields from matches.
2200 * --jhi */
dafc8851 2201#if 1
d6a28714
JH
2202 sp = prog->startp;
2203 ep = prog->endp;
2204 if (prog->nparens) {
eb160463 2205 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2206 *++sp = -1;
2207 *++ep = -1;
2208 }
2209 }
dafc8851 2210#endif
02db2b7b 2211 REGCP_SET(lastcp);
d6a28714
JH
2212 if (regmatch(prog->program + 1)) {
2213 prog->endp[0] = PL_reginput - PL_bostr;
2214 return 1;
2215 }
02db2b7b 2216 REGCP_UNWIND(lastcp);
d6a28714
JH
2217 return 0;
2218}
2219
02db2b7b
IZ
2220#define RE_UNWIND_BRANCH 1
2221#define RE_UNWIND_BRANCHJ 2
2222
2223union re_unwind_t;
2224
2225typedef struct { /* XX: makes sense to enlarge it... */
2226 I32 type;
2227 I32 prev;
2228 CHECKPOINT lastcp;
2229} re_unwind_generic_t;
2230
2231typedef struct {
2232 I32 type;
2233 I32 prev;
2234 CHECKPOINT lastcp;
2235 I32 lastparen;
2236 regnode *next;
2237 char *locinput;
2238 I32 nextchr;
2239#ifdef DEBUGGING
2240 int regindent;
2241#endif
2242} re_unwind_branch_t;
2243
2244typedef union re_unwind_t {
2245 I32 type;
2246 re_unwind_generic_t generic;
2247 re_unwind_branch_t branch;
2248} re_unwind_t;
2249
8ba1375e
MJD
2250#define sayYES goto yes
2251#define sayNO goto no
e0f9d4a8 2252#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2253#define sayYES_FINAL goto yes_final
2254#define sayYES_LOUD goto yes_loud
2255#define sayNO_FINAL goto no_final
2256#define sayNO_SILENT goto do_no
2257#define saySAME(x) if (x) goto yes; else goto no
2258
2259#define REPORT_CODE_OFF 24
2260
d6a28714
JH
2261/*
2262 - regmatch - main matching routine
2263 *
2264 * Conceptually the strategy is simple: check to see whether the current
2265 * node matches, call self recursively to see whether the rest matches,
2266 * and then act accordingly. In practice we make some effort to avoid
2267 * recursion, in particular by going through "ordinary" nodes (that don't
2268 * need to know whether the rest of the match failed) by a loop instead of
2269 * by recursion.
2270 */
2271/* [lwall] I've hoisted the register declarations to the outer block in order to
2272 * maybe save a little bit of pushing and popping on the stack. It also takes
2273 * advantage of machines that use a register save mask on subroutine entry.
2274 */
2275STATIC I32 /* 0 failure, 1 success */
2276S_regmatch(pTHX_ regnode *prog)
2277{
d6a28714
JH
2278 register regnode *scan; /* Current node. */
2279 regnode *next; /* Next node. */
2280 regnode *inner; /* Next node in internal branch. */
2281 register I32 nextchr; /* renamed nextchr - nextchar colides with
2282 function of same name */
2283 register I32 n; /* no or next */
b7953727
JH
2284 register I32 ln = 0; /* len or last */
2285 register char *s = Nullch; /* operand or save */
d6a28714 2286 register char *locinput = PL_reginput;
b7953727 2287 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2288 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2289 I32 unwind = 0;
b7953727 2290#if 0
02db2b7b 2291 I32 firstcp = PL_savestack_ix;
b7953727 2292#endif
53c4c00c 2293 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2294#ifdef DEBUGGING
ce333219
JH
2295 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2296 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2297 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2298#endif
02db2b7b 2299
d6a28714
JH
2300#ifdef DEBUGGING
2301 PL_regindent++;
2302#endif
2303
2304 /* Note that nextchr is a byte even in UTF */
2305 nextchr = UCHARAT(locinput);
2306 scan = prog;
2307 while (scan != NULL) {
8ba1375e 2308
2a782b5b 2309 DEBUG_r( {
d6a28714
JH
2310 SV *prop = sv_newmortal();
2311 int docolor = *PL_colors[0];
2312 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2313 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2314 /* The part of the string before starttry has one color
2315 (pref0_len chars), between starttry and current
2316 position another one (pref_len - pref0_len chars),
2317 after the current position the third one.
2318 We assume that pref0_len <= pref_len, otherwise we
2319 decrease pref0_len. */
9041c2e3 2320 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2321 ? (5 + taill) - l : locinput - PL_bostr;
2322 int pref0_len;
d6a28714 2323
df1ffd02 2324 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2325 pref_len++;
2326 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2327 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2328 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2329 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2330 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2331 l--;
d6a28714
JH
2332 if (pref0_len < 0)
2333 pref0_len = 0;
2334 if (pref0_len > pref_len)
2335 pref0_len = pref_len;
2336 regprop(prop, scan);
2a782b5b
JH
2337 {
2338 char *s0 =
f14c76ed 2339 do_utf8 && OP(scan) != CANY ?
2a782b5b 2340 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2341 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2342 locinput - pref_len;
df1ffd02 2343 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2344 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2345 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2346 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2347 locinput - pref_len + pref0_len;
df1ffd02 2348 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2349 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2350 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2351 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2352 locinput;
df1ffd02 2353 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2354 PerlIO_printf(Perl_debug_log,
2355 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2356 (IV)(locinput - PL_bostr),
2357 PL_colors[4],
2358 len0, s0,
2359 PL_colors[5],
2360 PL_colors[2],
2361 len1, s1,
2362 PL_colors[3],
2363 (docolor ? "" : "> <"),
2364 PL_colors[0],
2365 len2, s2,
2366 PL_colors[1],
2367 15 - l - pref_len + 1,
2368 "",
2369 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2370 SvPVX(prop));
2371 }
2372 });
d6a28714
JH
2373
2374 next = scan + NEXT_OFF(scan);
2375 if (next == scan)
2376 next = NULL;
2377
2378 switch (OP(scan)) {
2379 case BOL:
7fba1cd6 2380 if (locinput == PL_bostr)
d6a28714
JH
2381 {
2382 /* regtill = regbol; */
b8c5462f
JH
2383 break;
2384 }
d6a28714
JH
2385 sayNO;
2386 case MBOL:
12d33761
HS
2387 if (locinput == PL_bostr ||
2388 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2389 {
b8c5462f
JH
2390 break;
2391 }
d6a28714
JH
2392 sayNO;
2393 case SBOL:
c2a73568 2394 if (locinput == PL_bostr)
b8c5462f 2395 break;
d6a28714
JH
2396 sayNO;
2397 case GPOS:
2398 if (locinput == PL_reg_ganch)
2399 break;
2400 sayNO;
2401 case EOL:
d6a28714
JH
2402 goto seol;
2403 case MEOL:
d6a28714 2404 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2405 sayNO;
b8c5462f 2406 break;
d6a28714
JH
2407 case SEOL:
2408 seol:
2409 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2410 sayNO;
d6a28714 2411 if (PL_regeol - locinput > 1)
b8c5462f 2412 sayNO;
b8c5462f 2413 break;
d6a28714
JH
2414 case EOS:
2415 if (PL_regeol != locinput)
b8c5462f 2416 sayNO;
d6a28714 2417 break;
ffc61ed2 2418 case SANY:
d6a28714 2419 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2420 sayNO;
f33976b4
DB
2421 if (do_utf8) {
2422 locinput += PL_utf8skip[nextchr];
2423 if (locinput > PL_regeol)
2424 sayNO;
2425 nextchr = UCHARAT(locinput);
2426 }
2427 else
2428 nextchr = UCHARAT(++locinput);
2429 break;
2430 case CANY:
2431 if (!nextchr && locinput >= PL_regeol)
2432 sayNO;
b8c5462f 2433 nextchr = UCHARAT(++locinput);
a0d0e21e 2434 break;
ffc61ed2 2435 case REG_ANY:
1aa99e6b
IH
2436 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2437 sayNO;
2438 if (do_utf8) {
b8c5462f 2439 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2440 if (locinput > PL_regeol)
2441 sayNO;
a0ed51b3 2442 nextchr = UCHARAT(locinput);
a0ed51b3 2443 }
1aa99e6b
IH
2444 else
2445 nextchr = UCHARAT(++locinput);
a0ed51b3 2446 break;
d6a28714 2447 case EXACT:
cd439c50
IZ
2448 s = STRING(scan);
2449 ln = STR_LEN(scan);
eb160463 2450 if (do_utf8 != UTF) {
bc517b45 2451 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2452 char *l = locinput;
2453 char *e = s + ln;
bc517b45 2454 STRLEN ulen;
a72c7584 2455
5ff6fc6d
JH
2456 if (do_utf8) {
2457 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2458 while (s < e) {
2459 if (l >= PL_regeol)
5ff6fc6d
JH
2460 sayNO;
2461 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2462 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2463 ckWARN(WARN_UTF8) ?
2464 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2465 sayNO;
bc517b45 2466 l += ulen;
5ff6fc6d 2467 s ++;
1aa99e6b 2468 }
5ff6fc6d
JH
2469 }
2470 else {
2471 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2472 while (s < e) {
2473 if (l >= PL_regeol)
2474 sayNO;
5ff6fc6d 2475 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2476 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2477 ckWARN(WARN_UTF8) ?
2478 0 : UTF8_ALLOW_ANY))
1aa99e6b 2479 sayNO;
bc517b45 2480 s += ulen;
a72c7584 2481 l ++;
1aa99e6b 2482 }
5ff6fc6d 2483 }
1aa99e6b
IH
2484 locinput = l;
2485 nextchr = UCHARAT(locinput);
2486 break;
2487 }
bc517b45 2488 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2489 /* Inline the first character, for speed. */
2490 if (UCHARAT(s) != nextchr)
2491 sayNO;
2492 if (PL_regeol - locinput < ln)
2493 sayNO;
2494 if (ln > 1 && memNE(s, locinput, ln))
2495 sayNO;
2496 locinput += ln;
2497 nextchr = UCHARAT(locinput);
2498 break;
2499 case EXACTFL:
b8c5462f
JH
2500 PL_reg_flags |= RF_tainted;
2501 /* FALL THROUGH */
d6a28714 2502 case EXACTF:
cd439c50
IZ
2503 s = STRING(scan);
2504 ln = STR_LEN(scan);
d6a28714 2505
d07ddd77
JH
2506 if (do_utf8 || UTF) {
2507 /* Either target or the pattern are utf8. */
d6a28714 2508 char *l = locinput;
d07ddd77 2509 char *e = PL_regeol;
bc517b45 2510
eb160463 2511 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2512 l, &e, 0, do_utf8)) {
5486206c
JH
2513 /* One more case for the sharp s:
2514 * pack("U0U*", 0xDF) =~ /ss/i,
2515 * the 0xC3 0x9F are the UTF-8
2516 * byte sequence for the U+00DF. */
2517 if (!(do_utf8 &&
2518 toLOWER(s[0]) == 's' &&
2519 ln >= 2 &&
2520 toLOWER(s[1]) == 's' &&
2521 (U8)l[0] == 0xC3 &&
2522 e - l >= 2 &&
2523 (U8)l[1] == 0x9F))
2524 sayNO;
2525 }
d07ddd77
JH
2526 locinput = e;
2527 nextchr = UCHARAT(locinput);
2528 break;
a0ed51b3 2529 }
d6a28714 2530
bc517b45
JH
2531 /* Neither the target and the pattern are utf8. */
2532
d6a28714
JH
2533 /* Inline the first character, for speed. */
2534 if (UCHARAT(s) != nextchr &&
2535 UCHARAT(s) != ((OP(scan) == EXACTF)
2536 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2537 sayNO;
d6a28714 2538 if (PL_regeol - locinput < ln)
b8c5462f 2539 sayNO;
d6a28714
JH
2540 if (ln > 1 && (OP(scan) == EXACTF
2541 ? ibcmp(s, locinput, ln)
2542 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2543 sayNO;
d6a28714
JH
2544 locinput += ln;
2545 nextchr = UCHARAT(locinput);
a0d0e21e 2546 break;
d6a28714 2547 case ANYOF:
ffc61ed2 2548 if (do_utf8) {
9e55ce06
JH
2549 STRLEN inclasslen = PL_regeol - locinput;
2550
ba7b4546 2551 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2552 sayNO_ANYOF;
ffc61ed2
JH
2553 if (locinput >= PL_regeol)
2554 sayNO;
0f0076b4 2555 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2556 nextchr = UCHARAT(locinput);
e0f9d4a8 2557 break;
ffc61ed2
JH
2558 }
2559 else {
2560 if (nextchr < 0)
2561 nextchr = UCHARAT(locinput);
7d3e948e 2562 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2563 sayNO_ANYOF;
ffc61ed2
JH
2564 if (!nextchr && locinput >= PL_regeol)
2565 sayNO;
2566 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2567 break;
2568 }
2569 no_anyof:
2570 /* If we might have the case of the German sharp s
2571 * in a casefolding Unicode character class. */
2572
ebc501f0
JH
2573 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2574 locinput += SHARP_S_SKIP;
e0f9d4a8 2575 nextchr = UCHARAT(locinput);
ffc61ed2 2576 }
e0f9d4a8
JH
2577 else
2578 sayNO;
b8c5462f 2579 break;
d6a28714 2580 case ALNUML:
b8c5462f
JH
2581 PL_reg_flags |= RF_tainted;
2582 /* FALL THROUGH */
d6a28714 2583 case ALNUM:
b8c5462f 2584 if (!nextchr)
4633a7c4 2585 sayNO;
ffc61ed2 2586 if (do_utf8) {
ad24be35 2587 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2588 if (!(OP(scan) == ALNUM
3568d838 2589 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2590 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2591 {
2592 sayNO;
a0ed51b3 2593 }
b8c5462f 2594 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2595 nextchr = UCHARAT(locinput);
2596 break;
2597 }
ffc61ed2 2598 if (!(OP(scan) == ALNUM
d6a28714 2599 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2600 sayNO;
b8c5462f 2601 nextchr = UCHARAT(++locinput);
a0d0e21e 2602 break;
d6a28714 2603 case NALNUML:
b8c5462f
JH
2604 PL_reg_flags |= RF_tainted;
2605 /* FALL THROUGH */
d6a28714
JH
2606 case NALNUM:
2607 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2608 sayNO;
ffc61ed2 2609 if (do_utf8) {
8269fa76 2610 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2611 if (OP(scan) == NALNUM
3568d838 2612 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2613 : isALNUM_LC_utf8((U8*)locinput))
2614 {
b8c5462f 2615 sayNO;
d6a28714 2616 }
b8c5462f
JH
2617 locinput += PL_utf8skip[nextchr];
2618 nextchr = UCHARAT(locinput);
2619 break;
2620 }
ffc61ed2 2621 if (OP(scan) == NALNUM
d6a28714 2622 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2623 sayNO;
76e3520e 2624 nextchr = UCHARAT(++locinput);
a0d0e21e 2625 break;
d6a28714
JH
2626 case BOUNDL:
2627 case NBOUNDL:
3280af22 2628 PL_reg_flags |= RF_tainted;
bbce6d69 2629 /* FALL THROUGH */
d6a28714
JH
2630 case BOUND:
2631 case NBOUND:
2632 /* was last char in word? */
ffc61ed2 2633 if (do_utf8) {
12d33761
HS
2634 if (locinput == PL_bostr)
2635 ln = '\n';
ffc61ed2 2636 else {
b4f7163a 2637 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2638
b4f7163a 2639 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2640 }
2641 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2642 ln = isALNUM_uni(ln);
8269fa76 2643 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2644 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2645 }
2646 else {
9041c2e3 2647 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2648 n = isALNUM_LC_utf8((U8*)locinput);
2649 }
a0ed51b3 2650 }
d6a28714 2651 else {
12d33761
HS
2652 ln = (locinput != PL_bostr) ?
2653 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2654 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2655 ln = isALNUM(ln);
2656 n = isALNUM(nextchr);
2657 }
2658 else {
2659 ln = isALNUM_LC(ln);
2660 n = isALNUM_LC(nextchr);
2661 }
d6a28714 2662 }
ffc61ed2
JH
2663 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2664 OP(scan) == BOUNDL))
2665 sayNO;
a0ed51b3 2666 break;
d6a28714 2667 case SPACEL:
3280af22 2668 PL_reg_flags |= RF_tainted;
bbce6d69 2669 /* FALL THROUGH */
d6a28714 2670 case SPACE:
9442cb0e 2671 if (!nextchr)
4633a7c4 2672 sayNO;
1aa99e6b 2673 if (do_utf8) {
fd400ab9 2674 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2675 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2676 if (!(OP(scan) == SPACE
3568d838 2677 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2678 : isSPACE_LC_utf8((U8*)locinput)))
2679 {
2680 sayNO;
2681 }
2682 locinput += PL_utf8skip[nextchr];
2683 nextchr = UCHARAT(locinput);
2684 break;
d6a28714 2685 }
ffc61ed2
JH
2686 if (!(OP(scan) == SPACE
2687 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2688 sayNO;
2689 nextchr = UCHARAT(++locinput);
2690 }
2691 else {
2692 if (!(OP(scan) == SPACE
2693 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2694 sayNO;
2695 nextchr = UCHARAT(++locinput);
a0ed51b3 2696 }
a0ed51b3 2697 break;
d6a28714 2698 case NSPACEL:
3280af22 2699 PL_reg_flags |= RF_tainted;
bbce6d69 2700 /* FALL THROUGH */
d6a28714 2701 case NSPACE:
9442cb0e 2702 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2703 sayNO;
1aa99e6b 2704 if (do_utf8) {
8269fa76 2705 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2706 if (OP(scan) == NSPACE
3568d838 2707 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2708 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2709 {
2710 sayNO;
2711 }
2712 locinput += PL_utf8skip[nextchr];
2713 nextchr = UCHARAT(locinput);
2714 break;
a0ed51b3 2715 }
ffc61ed2 2716 if (OP(scan) == NSPACE
d6a28714 2717 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2718 sayNO;
76e3520e 2719 nextchr = UCHARAT(++locinput);
a0d0e21e 2720 break;
d6a28714 2721 case DIGITL:
a0ed51b3
LW
2722 PL_reg_flags |= RF_tainted;
2723 /* FALL THROUGH */
d6a28714 2724 case DIGIT:
9442cb0e 2725 if (!nextchr)
a0ed51b3 2726 sayNO;
1aa99e6b 2727 if (do_utf8) {
8269fa76 2728 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2729 if (!(OP(scan) == DIGIT
3568d838 2730 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2731 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2732 {
a0ed51b3 2733 sayNO;
dfe13c55 2734 }
6f06b55f 2735 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2736 nextchr = UCHARAT(locinput);
2737 break;
2738 }
ffc61ed2 2739 if (!(OP(scan) == DIGIT
9442cb0e 2740 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2741 sayNO;
2742 nextchr = UCHARAT(++locinput);
2743 break;
d6a28714 2744 case NDIGITL:
b8c5462f
JH
2745 PL_reg_flags |= RF_tainted;
2746 /* FALL THROUGH */
d6a28714 2747 case NDIGIT:
9442cb0e 2748 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2749 sayNO;
1aa99e6b 2750 if (do_utf8) {
8269fa76 2751 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2752 if (OP(scan) == NDIGIT
3568d838 2753 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2754 : isDIGIT_LC_utf8((U8*)locinput))
2755 {
a0ed51b3 2756 sayNO;
9442cb0e 2757 }
6f06b55f 2758 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2759 nextchr = UCHARAT(locinput);
2760 break;
2761 }
ffc61ed2 2762 if (OP(scan) == NDIGIT
9442cb0e 2763 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2764 sayNO;
2765 nextchr = UCHARAT(++locinput);
2766 break;
2767 case CLUMP:
b7c83a7e 2768 if (locinput >= PL_regeol)
a0ed51b3 2769 sayNO;
b7c83a7e
JH
2770 if (do_utf8) {
2771 LOAD_UTF8_CHARCLASS(mark,"~");
2772 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2773 sayNO;
2774 locinput += PL_utf8skip[nextchr];
2775 while (locinput < PL_regeol &&
2776 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2777 locinput += UTF8SKIP(locinput);
2778 if (locinput > PL_regeol)
2779 sayNO;
eb08e2da
JH
2780 }
2781 else
2782 locinput++;
a0ed51b3
LW
2783 nextchr = UCHARAT(locinput);
2784 break;
c8756f30 2785 case REFFL:
3280af22 2786 PL_reg_flags |= RF_tainted;
c8756f30 2787 /* FALL THROUGH */
c277df42 2788 case REF:
c8756f30 2789 case REFF:
c277df42 2790 n = ARG(scan); /* which paren pair */
cf93c79d 2791 ln = PL_regstartp[n];
2c2d71f5 2792 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2793 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2794 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2795 if (ln == PL_regendp[n])
a0d0e21e 2796 break;
a0ed51b3 2797
cf93c79d 2798 s = PL_bostr + ln;
1aa99e6b 2799 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2800 char *l = locinput;
cf93c79d 2801 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2802 /*
2803 * Note that we can't do the "other character" lookup trick as
2804 * in the 8-bit case (no pun intended) because in Unicode we
2805 * have to map both upper and title case to lower case.
2806 */
2807 if (OP(scan) == REFF) {
a2a2844f 2808 STRLEN ulen1, ulen2;
e7ae6809
JH
2809 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2810 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2811 while (s < e) {
2812 if (l >= PL_regeol)
2813 sayNO;
a2a2844f
JH
2814 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2815 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2816 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2817 sayNO;
a2a2844f
JH
2818 s += ulen1;
2819 l += ulen2;
a0ed51b3
LW
2820 }
2821 }
2822 locinput = l;
2823 nextchr = UCHARAT(locinput);
2824 break;
2825 }
2826
a0d0e21e 2827 /* Inline the first character, for speed. */
76e3520e 2828 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2829 (OP(scan) == REF ||
2830 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2831 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2832 sayNO;
cf93c79d 2833 ln = PL_regendp[n] - ln;
3280af22 2834 if (locinput + ln > PL_regeol)
4633a7c4 2835 sayNO;
c8756f30
AK
2836 if (ln > 1 && (OP(scan) == REF
2837 ? memNE(s, locinput, ln)
2838 : (OP(scan) == REFF
2839 ? ibcmp(s, locinput, ln)
2840 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2841 sayNO;
a0d0e21e 2842 locinput += ln;
76e3520e 2843 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2844 break;
2845
2846 case NOTHING:
c277df42 2847 case TAIL:
a0d0e21e
LW
2848 break;
2849 case BACK:
2850 break;
c277df42
IZ
2851 case EVAL:
2852 {
2853 dSP;
533c011a 2854 OP_4tree *oop = PL_op;
3280af22 2855 COP *ocurcop = PL_curcop;
f3548bdc 2856 PAD *old_comppad;
c277df42 2857 SV *ret;
080c2dec 2858 struct regexp *oreg = PL_reg_re;
9041c2e3 2859
c277df42 2860 n = ARG(scan);
533c011a 2861 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2862 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2863 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2864 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2865
8e5e9ebe
RGS
2866 {
2867 SV **before = SP;
2868 CALLRUNOPS(aTHX); /* Scalar context. */
2869 SPAGAIN;
2870 if (SP == before)
075aa684 2871 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2872 else {
2873 ret = POPs;
2874 PUTBACK;
2875 }
2876 }
2877
0f5d15d6 2878 PL_op = oop;
f3548bdc 2879 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2880 PL_curcop = ocurcop;
c277df42 2881 if (logical) {
0f5d15d6
IZ
2882 if (logical == 2) { /* Postponed subexpression. */
2883 regexp *re;
22c35a8c 2884 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2885 re_cc_state state;
0f5d15d6 2886 CHECKPOINT cp, lastcp;
cb50f42d 2887 int toggleutf;
faf82a0b 2888 register SV *sv;
0f5d15d6 2889
faf82a0b
AE
2890 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2891 mg = mg_find(sv, PERL_MAGIC_qr);
2892 else if (SvSMAGICAL(ret)) {
2893 if (SvGMAGICAL(ret))
2894 sv_unmagic(ret, PERL_MAGIC_qr);
2895 else
2896 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 2897 }
faf82a0b 2898
0f5d15d6
IZ
2899 if (mg) {
2900 re = (regexp *)mg->mg_obj;
df0003d4 2901 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2902 }
2903 else {
2904 STRLEN len;
2905 char *t = SvPV(ret, len);
2906 PMOP pm;
2907 char *oprecomp = PL_regprecomp;
2908 I32 osize = PL_regsize;
2909 I32 onpar = PL_regnpar;
2910
5fcd1c1b 2911 Zero(&pm, 1, PMOP);
cb50f42d 2912 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2913 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2914 if (!(SvFLAGS(ret)
faf82a0b
AE
2915 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2916 | SVs_GMG)))
14befaf4
DM
2917 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2918 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2919 PL_regprecomp = oprecomp;
2920 PL_regsize = osize;
2921 PL_regnpar = onpar;
2922 }
2923 DEBUG_r(
9041c2e3 2924 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2925 "Entering embedded `%s%.60s%s%s'\n",
2926 PL_colors[0],
2927 re->precomp,
2928 PL_colors[1],
2929 (strlen(re->precomp) > 60 ? "..." : ""))
2930 );
2931 state.node = next;
2932 state.prev = PL_reg_call_cc;
2933 state.cc = PL_regcc;
2934 state.re = PL_reg_re;
2935
2ab05381 2936 PL_regcc = 0;
9041c2e3 2937
0f5d15d6 2938 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2939 REGCP_SET(lastcp);
0f5d15d6
IZ
2940 cache_re(re);
2941 state.ss = PL_savestack_ix;
2942 *PL_reglastparen = 0;
a01268b5 2943 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2944 PL_reg_call_cc = &state;
2945 PL_reginput = locinput;
cb50f42d
YST
2946 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2947 ((re->reganch & ROPT_UTF8) != 0);
2948 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2949
2950 /* XXXX This is too dramatic a measure... */
2951 PL_reg_maxiter = 0;
2952
0f5d15d6 2953 if (regmatch(re->program + 1)) {
2c914db6
IZ
2954 /* Even though we succeeded, we need to restore
2955 global variables, since we may be wrapped inside
2956 SUSPEND, thus the match may be not finished yet. */
2957
2958 /* XXXX Do this only if SUSPENDed? */
2959 PL_reg_call_cc = state.prev;
2960 PL_regcc = state.cc;
2961 PL_reg_re = state.re;
2962 cache_re(PL_reg_re);
cb50f42d 2963 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2964
2965 /* XXXX This is too dramatic a measure... */
2966 PL_reg_maxiter = 0;
2967
2968 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2969 ReREFCNT_dec(re);
2970 regcpblow(cp);
2971 sayYES;
2972 }
0f5d15d6 2973 ReREFCNT_dec(re);
02db2b7b 2974 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2975 regcppop();
2976 PL_reg_call_cc = state.prev;
2977 PL_regcc = state.cc;
2978 PL_reg_re = state.re;
d3790889 2979 cache_re(PL_reg_re);
cb50f42d 2980 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2981
2982 /* XXXX This is too dramatic a measure... */
2983 PL_reg_maxiter = 0;
2984
8e514ae6 2985 logical = 0;
0f5d15d6
IZ
2986 sayNO;
2987 }
c277df42 2988 sw = SvTRUE(ret);
0f5d15d6 2989 logical = 0;
a0ed51b3 2990 }
080c2dec 2991 else {
3280af22 2992 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
2993 cache_re(oreg);
2994 }
c277df42
IZ
2995 break;
2996 }
a0d0e21e 2997 case OPEN:
c277df42 2998 n = ARG(scan); /* which paren pair */
3280af22
NIS
2999 PL_reg_start_tmp[n] = locinput;
3000 if (n > PL_regsize)
3001 PL_regsize = n;
a0d0e21e
LW
3002 break;
3003 case CLOSE:
c277df42 3004 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3005 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3006 PL_regendp[n] = locinput - PL_bostr;
eb160463 3007 if (n > (I32)*PL_reglastparen)
3280af22 3008 *PL_reglastparen = n;
a01268b5 3009 *PL_reglastcloseparen = n;
a0d0e21e 3010 break;
c277df42
IZ
3011 case GROUPP:
3012 n = ARG(scan); /* which paren pair */
eb160463 3013 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3014 break;
3015 case IFTHEN:
2c2d71f5 3016 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3017 if (sw)
3018 next = NEXTOPER(NEXTOPER(scan));
3019 else {
3020 next = scan + ARG(scan);
3021 if (OP(next) == IFTHEN) /* Fake one. */
3022 next = NEXTOPER(NEXTOPER(next));
3023 }
3024 break;
3025 case LOGICAL:
0f5d15d6 3026 logical = scan->flags;
c277df42 3027 break;
2ab05381
IZ
3028/*******************************************************************
3029 PL_regcc contains infoblock about the innermost (...)* loop, and
3030 a pointer to the next outer infoblock.
3031
3032 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3033
3034 1) After matching X, regnode for CURLYX is processed;
3035
9041c2e3 3036 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3037 regmatch() recursively with the starting point at WHILEM node;
3038
3039 3) Each hit of WHILEM node tries to match A and Z (in the order
3040 depending on the current iteration, min/max of {min,max} and
3041 greediness). The information about where are nodes for "A"
3042 and "Z" is read from the infoblock, as is info on how many times "A"
3043 was already matched, and greediness.
3044
3045 4) After A matches, the same WHILEM node is hit again.
3046
3047 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3048 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3049 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3050 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3051 of the external loop.
3052
3053 Currently present infoblocks form a tree with a stem formed by PL_curcc
3054 and whatever it mentions via ->next, and additional attached trees
3055 corresponding to temporarily unset infoblocks as in "5" above.
3056
9041c2e3 3057 In the following picture infoblocks for outer loop of
2ab05381
IZ
3058 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3059 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3060 infoblocks are drawn below the "reset" infoblock.
3061
3062 In fact in the picture below we do not show failed matches for Z and T
3063 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3064 more obvious *why* one needs to *temporary* unset infoblocks.]
3065
3066 Matched REx position InfoBlocks Comment
3067 (Y(A)*?Z)*?T x
3068 Y(A)*?Z)*?T x <- O
3069 Y (A)*?Z)*?T x <- O
3070 Y A)*?Z)*?T x <- O <- I
3071 YA )*?Z)*?T x <- O <- I
3072 YA A)*?Z)*?T x <- O <- I
3073 YAA )*?Z)*?T x <- O <- I
3074 YAA Z)*?T x <- O # Temporary unset I
3075 I
3076
3077 YAAZ Y(A)*?Z)*?T x <- O
3078 I
3079
3080 YAAZY (A)*?Z)*?T x <- O
3081 I
3082
3083 YAAZY A)*?Z)*?T x <- O <- I
3084 I
3085
3086 YAAZYA )*?Z)*?T x <- O <- I
3087 I
3088
3089 YAAZYA Z)*?T x <- O # Temporary unset I
3090 I,I
3091
3092 YAAZYAZ )*?T x <- O
3093 I,I
3094
3095 YAAZYAZ T x # Temporary unset O
3096 O
3097 I,I
3098
3099 YAAZYAZT x
3100 O
3101 I,I
3102 *******************************************************************/
a0d0e21e
LW
3103 case CURLYX: {
3104 CURCUR cc;
3280af22 3105 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3106 /* No need to save/restore up to this paren */
3107 I32 parenfloor = scan->flags;
c277df42
IZ
3108
3109 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3110 next += ARG(next);
3280af22
NIS
3111 cc.oldcc = PL_regcc;
3112 PL_regcc = &cc;
cb434fcc
IZ
3113 /* XXXX Probably it is better to teach regpush to support
3114 parenfloor > PL_regsize... */
eb160463 3115 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3116 parenfloor = *PL_reglastparen; /* Pessimization... */
3117 cc.parenfloor = parenfloor;
a0d0e21e
LW
3118 cc.cur = -1;
3119 cc.min = ARG1(scan);
3120 cc.max = ARG2(scan);
c277df42 3121 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3122 cc.next = next;
3123 cc.minmod = minmod;
3124 cc.lastloc = 0;
3280af22 3125 PL_reginput = locinput;
a0d0e21e
LW
3126 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3127 regcpblow(cp);
3280af22 3128 PL_regcc = cc.oldcc;
4633a7c4 3129 saySAME(n);
a0d0e21e
LW
3130 }
3131 /* NOT REACHED */
3132 case WHILEM: {
3133 /*
3134 * This is really hard to understand, because after we match
3135 * what we're trying to match, we must make sure the rest of
2c2d71f5 3136 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3137 * to go back UP the parse tree by recursing ever deeper. And
3138 * if it fails, we have to reset our parent's current state
3139 * that we can try again after backing off.
3140 */
3141
c277df42 3142 CHECKPOINT cp, lastcp;
3280af22 3143 CURCUR* cc = PL_regcc;
c277df42
IZ
3144 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3145
4633a7c4 3146 n = cc->cur + 1; /* how many we know we matched */
3280af22 3147 PL_reginput = locinput;
a0d0e21e 3148
c277df42 3149 DEBUG_r(
9041c2e3 3150 PerlIO_printf(Perl_debug_log,
91f3b821 3151 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3152 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3153 (long)n, (long)cc->min,
2797576d 3154 (long)cc->max, PTR2UV(cc))
c277df42 3155 );
4633a7c4 3156
a0d0e21e
LW
3157 /* If degenerate scan matches "", assume scan done. */
3158
579cf2c3 3159 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3160 PL_regcc = cc->oldcc;
2ab05381
IZ
3161 if (PL_regcc)
3162 ln = PL_regcc->cur;
c277df42 3163 DEBUG_r(
c3464db5
DD
3164 PerlIO_printf(Perl_debug_log,
3165 "%*s empty match detected, try continuation...\n",
3280af22 3166 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3167 );
a0d0e21e 3168 if (regmatch(cc->next))
4633a7c4 3169 sayYES;
2ab05381
IZ
3170 if (PL_regcc)
3171 PL_regcc->cur = ln;
3280af22 3172 PL_regcc = cc;
4633a7c4 3173 sayNO;
a0d0e21e
LW
3174 }
3175
3176 /* First just match a string of min scans. */
3177
3178 if (n < cc->min) {
3179 cc->cur = n;
3180 cc->lastloc = locinput;
4633a7c4
LW
3181 if (regmatch(cc->scan))
3182 sayYES;
3183 cc->cur = n - 1;
c277df42 3184 cc->lastloc = lastloc;
4633a7c4 3185 sayNO;
a0d0e21e
LW
3186 }
3187
2c2d71f5
JH
3188 if (scan->flags) {
3189 /* Check whether we already were at this position.
3190 Postpone detection until we know the match is not
3191 *that* much linear. */
3192 if (!PL_reg_maxiter) {
3193 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3194 PL_reg_leftiter = PL_reg_maxiter;
3195 }
3196 if (PL_reg_leftiter-- == 0) {
3197 I32 size = (PL_reg_maxiter + 7)/8;
3198 if (PL_reg_poscache) {
eb160463 3199 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3200 Renew(PL_reg_poscache, size, char);
3201 PL_reg_poscache_size = size;
3202 }
3203 Zero(PL_reg_poscache, size, char);
3204 }
3205 else {
3206 PL_reg_poscache_size = size;
3207 Newz(29, PL_reg_poscache, size, char);
3208 }
3209 DEBUG_r(
3210 PerlIO_printf(Perl_debug_log,
3211 "%sDetected a super-linear match, switching on caching%s...\n",
3212 PL_colors[4], PL_colors[5])
3213 );
3214 }
3215 if (PL_reg_leftiter < 0) {
3216 I32 o = locinput - PL_bostr, b;
3217
3218 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3219 b = o % 8;
3220 o /= 8;
3221 if (PL_reg_poscache[o] & (1<<b)) {
3222 DEBUG_r(
3223 PerlIO_printf(Perl_debug_log,
3224 "%*s already tried at this position...\n",
3225 REPORT_CODE_OFF+PL_regindent*2, "")
3226 );
c2b0868c
HS
3227 if (PL_reg_flags & RF_false)
3228 sayYES;
3229 else
3230 sayNO_SILENT;
2c2d71f5
JH
3231 }
3232 PL_reg_poscache[o] |= (1<<b);
3233 }
3234 }
3235
a0d0e21e
LW
3236 /* Prefer next over scan for minimal matching. */
3237
3238 if (cc->minmod) {
3280af22 3239 PL_regcc = cc->oldcc;
2ab05381
IZ
3240 if (PL_regcc)
3241 ln = PL_regcc->cur;
5f05dabc 3242 cp = regcppush(cc->parenfloor);
02db2b7b 3243 REGCP_SET(lastcp);
5f05dabc 3244 if (regmatch(cc->next)) {
c277df42 3245 regcpblow(cp);
4633a7c4 3246 sayYES; /* All done. */
5f05dabc 3247 }
02db2b7b 3248 REGCP_UNWIND(lastcp);
5f05dabc 3249 regcppop();
2ab05381
IZ
3250 if (PL_regcc)
3251 PL_regcc->cur = ln;
3280af22 3252 PL_regcc = cc;
a0d0e21e 3253
c277df42 3254 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3255 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3256 && !(PL_reg_flags & RF_warned)) {
3257 PL_reg_flags |= RF_warned;
9014280d 3258 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3259 "Complex regular subexpression recursion",
3260 REG_INFTY - 1);
c277df42 3261 }
4633a7c4 3262 sayNO;
c277df42 3263 }
a687059c 3264
c277df42 3265 DEBUG_r(
c3464db5
DD
3266 PerlIO_printf(Perl_debug_log,
3267 "%*s trying longer...\n",
3280af22 3268 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3269 );
a0d0e21e 3270 /* Try scanning more and see if it helps. */
3280af22 3271 PL_reginput = locinput;
a0d0e21e
LW
3272 cc->cur = n;
3273 cc->lastloc = locinput;
5f05dabc 3274 cp = regcppush(cc->parenfloor);
02db2b7b 3275 REGCP_SET(lastcp);
5f05dabc 3276 if (regmatch(cc->scan)) {
c277df42 3277 regcpblow(cp);
4633a7c4 3278 sayYES;
5f05dabc 3279 }
02db2b7b 3280 REGCP_UNWIND(lastcp);
5f05dabc 3281 regcppop();
4633a7c4 3282 cc->cur = n - 1;
c277df42 3283 cc->lastloc = lastloc;
4633a7c4 3284 sayNO;
a0d0e21e
LW
3285 }
3286
3287 /* Prefer scan over next for maximal matching. */
3288
3289 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3290 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3291 cc->cur = n;
3292 cc->lastloc = locinput;
02db2b7b 3293 REGCP_SET(lastcp);
5f05dabc 3294 if (regmatch(cc->scan)) {
c277df42 3295 regcpblow(cp);
4633a7c4 3296 sayYES;
5f05dabc 3297 }
02db2b7b 3298 REGCP_UNWIND(lastcp);
a0d0e21e 3299 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3300 PL_reginput = locinput;
c277df42 3301 DEBUG_r(
c3464db5
DD
3302 PerlIO_printf(Perl_debug_log,
3303 "%*s failed, try continuation...\n",
3280af22 3304 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3305 );
3306 }
9041c2e3 3307 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3308 && !(PL_reg_flags & RF_warned)) {
3280af22 3309 PL_reg_flags |= RF_warned;
9014280d 3310 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3311 "Complex regular subexpression recursion",
3312 REG_INFTY - 1);
a0d0e21e
LW
3313 }
3314
3315 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3316 PL_regcc = cc->oldcc;
2ab05381
IZ
3317 if (PL_regcc)
3318 ln = PL_regcc->cur;
a0d0e21e 3319 if (regmatch(cc->next))
4633a7c4 3320 sayYES;
2ab05381
IZ
3321 if (PL_regcc)
3322 PL_regcc->cur = ln;
3280af22 3323 PL_regcc = cc;
4633a7c4 3324 cc->cur = n - 1;
c277df42 3325 cc->lastloc = lastloc;
4633a7c4 3326 sayNO;
a0d0e21e
LW
3327 }
3328 /* NOT REACHED */
9041c2e3 3329 case BRANCHJ:
c277df42
IZ
3330 next = scan + ARG(scan);
3331 if (next == scan)
3332 next = NULL;
3333 inner = NEXTOPER(NEXTOPER(scan));
3334 goto do_branch;
9041c2e3 3335 case BRANCH:
c277df42
IZ
3336 inner = NEXTOPER(scan);
3337 do_branch:
3338 {
c277df42
IZ
3339 c1 = OP(scan);
3340 if (OP(next) != c1) /* No choice. */
3341 next = inner; /* Avoid recursion. */
a0d0e21e 3342 else {
02db2b7b
IZ
3343 I32 lastparen = *PL_reglastparen;
3344 I32 unwind1;
3345 re_unwind_branch_t *uw;
3346
3347 /* Put unwinding data on stack */
3348 unwind1 = SSNEWt(1,re_unwind_branch_t);
3349 uw = SSPTRt(unwind1,re_unwind_branch_t);
3350 uw->prev = unwind;
3351 unwind = unwind1;
3352 uw->type = ((c1 == BRANCH)
3353 ? RE_UNWIND_BRANCH
3354 : RE_UNWIND_BRANCHJ);
3355 uw->lastparen = lastparen;
3356 uw->next = next;
3357 uw->locinput = locinput;
3358 uw->nextchr = nextchr;
3359#ifdef DEBUGGING
3360 uw->regindent = ++PL_regindent;
3361#endif
c277df42 3362
02db2b7b
IZ
3363 REGCP_SET(uw->lastcp);
3364
3365 /* Now go into the first branch */
3366 next = inner;
a687059c 3367 }
a0d0e21e
LW
3368 }
3369 break;
3370 case MINMOD:
3371 minmod = 1;
3372 break;
c277df42
IZ
3373 case CURLYM:
3374 {
00db4c45 3375 I32 l = 0;
c277df42 3376 CHECKPOINT lastcp;
9041c2e3 3377
c277df42 3378 /* We suppose that the next guy does not need
0e788c72 3379 backtracking: in particular, it is of constant non-zero length,
c277df42
IZ
3380 and has no parenths to influence future backrefs. */
3381 ln = ARG1(scan); /* min to match */
3382 n = ARG2(scan); /* max to match */
c277df42
IZ
3383 paren = scan->flags;
3384 if (paren) {
3280af22
NIS
3385 if (paren > PL_regsize)
3386 PL_regsize = paren;
eb160463 3387 if (paren > (I32)*PL_reglastparen)
3280af22 3388 *PL_reglastparen = paren;
c277df42 3389 }
dc45a647 3390 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3391 if (paren)
3392 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3393 PL_reginput = locinput;
c277df42
IZ
3394 if (minmod) {
3395 minmod = 0;
3396 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3397 sayNO;
3280af22 3398 locinput = PL_reginput;
cca55fe3 3399 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3400 regnode *text_node = next;
3401
cca55fe3 3402 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3403
cca55fe3 3404 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3405 else {
cca55fe3 3406 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3407 c1 = c2 = -1000;
3408 goto assume_ok_MM;
cca55fe3
JP
3409 }
3410 else { c1 = (U8)*STRING(text_node); }
af5decee 3411 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3412 c2 = PL_fold[c1];
af5decee 3413 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3414 c2 = PL_fold_locale[c1];
3415 else
3416 c2 = c1;
3417 }
a0ed51b3
LW
3418 }
3419 else
c277df42 3420 c1 = c2 = -1000;
cca55fe3 3421 assume_ok_MM:
02db2b7b 3422 REGCP_SET(lastcp);
0e788c72 3423 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
c277df42
IZ
3424 /* If it could work, try it. */
3425 if (c1 == -1000 ||
3280af22
NIS
3426 UCHARAT(PL_reginput) == c1 ||
3427 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3428 {
3429 if (paren) {
f31a99c8 3430 if (ln) {
cf93c79d
IZ
3431 PL_regstartp[paren] =
3432 HOPc(PL_reginput, -l) - PL_bostr;
3433 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3434 }
3435 else
cf93c79d 3436 PL_regendp[paren] = -1;
c277df42
IZ
3437 }
3438 if (regmatch(next))
3439 sayYES;
02db2b7b 3440 REGCP_UNWIND(lastcp);
c277df42
IZ
3441 }
3442 /* Couldn't or didn't -- move forward. */
3280af22 3443 PL_reginput = locinput;
c277df42
IZ
3444 if (regrepeat_hard(scan, 1, &l)) {
3445 ln++;
3280af22 3446 locinput = PL_reginput;
c277df42
IZ
3447 }
3448 else
3449 sayNO;
3450 }
a0ed51b3
LW
3451 }
3452 else {
c277df42 3453 n = regrepeat_hard(scan, n, &l);
3280af22 3454 locinput = PL_reginput;
c277df42 3455 DEBUG_r(
5c0ca799 3456 PerlIO_printf(Perl_debug_log,
faccc32b 3457 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3458 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3459 (IV) n, (IV)l)
c277df42
IZ
3460 );
3461 if (n >= ln) {
cca55fe3 3462 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3463 regnode *text_node = next;
3464
cca55fe3 3465 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3466
cca55fe3 3467 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3468 else {
cca55fe3 3469 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3470 c1 = c2 = -1000;
3471 goto assume_ok_REG;
cca55fe3
JP
3472 }
3473 else { c1 = (U8)*STRING(text_node); }
3474
af5decee 3475 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3476 c2 = PL_fold[c1];
af5decee 3477 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3478 c2 = PL_fold_locale[c1];
3479 else
3480 c2 = c1;
3481 }
a0ed51b3
LW
3482 }
3483 else
c277df42
IZ
3484 c1 = c2 = -1000;
3485 }
cca55fe3 3486 assume_ok_REG:
02db2b7b 3487 REGCP_SET(lastcp);
c277df42
IZ
3488 while (n >= ln) {
3489 /* If it could work, try it. */
3490 if (c1 == -1000 ||
3280af22
NIS
3491 UCHARAT(PL_reginput) == c1 ||
3492 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3493 {
3494 DEBUG_r(
c3464db5 3495 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3496 "%*s trying tail with n=%"IVdf"...\n",
3497 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3498 );
3499 if (paren) {
3500 if (n) {
cf93c79d
IZ
3501 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3502 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3503 }
a0ed51b3 3504 else
cf93c79d 3505 PL_regendp[paren] = -1;
c277df42 3506 }
a0ed51b3
LW
3507 if (regmatch(next))
3508 sayYES;
02db2b7b 3509 REGCP_UNWIND(lastcp);
a0ed51b3 3510 }
c277df42
IZ
3511 /* Couldn't or didn't -- back up. */
3512 n--;
dfe13c55 3513 locinput = HOPc(locinput, -l);
3280af22 3514 PL_reginput = locinput;
c277df42
IZ
3515 }
3516 }
3517 sayNO;
3518 break;
3519 }
3520 case CURLYN:
3521 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3522 if (paren > PL_regsize)
3523 PL_regsize = paren;
eb160463 3524 if (paren > (I32)*PL_reglastparen)
3280af22 3525 *PL_reglastparen = paren;
c277df42
IZ
3526 ln = ARG1(scan); /* min to match */
3527 n = ARG2(scan); /* max to match */
dc45a647 3528 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3529 goto repeat;
a0d0e21e 3530 case CURLY:
c277df42 3531 paren = 0;
a0d0e21e
LW
3532 ln = ARG1(scan); /* min to match */
3533 n = ARG2(scan); /* max to match */
dc45a647 3534 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3535 goto repeat;
3536 case STAR:
3537 ln = 0;
c277df42 3538 n = REG_INFTY;
a0d0e21e 3539 scan = NEXTOPER(scan);
c277df42 3540 paren = 0;
a0d0e21e
LW
3541 goto repeat;
3542 case PLUS:
c277df42
IZ
3543 ln = 1;
3544 n = REG_INFTY;
3545 scan = NEXTOPER(scan);
3546 paren = 0;
3547 repeat:
a0d0e21e
LW
3548 /*
3549 * Lookahead to avoid useless match attempts
3550 * when we know what character comes next.
3551 */
5f80c4cf
JP
3552
3553 /*
3554 * Used to only do .*x and .*?x, but now it allows
3555 * for )'s, ('s and (?{ ... })'s to be in the way
3556 * of the quantifier and the EXACT-like node. -- japhy
3557 */
3558
cca55fe3 3559 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3560 U8 *s;
3561 regnode *text_node = next;
3562
cca55fe3 3563 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3564
cca55fe3 3565 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3566 else {
cca55fe3 3567 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3568 c1 = c2 = -1000;
3569 goto assume_ok_easy;
cca55fe3
JP
3570 }
3571 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3572
3573 if (!UTF) {
3574 c2 = c1 = *s;
f65d3ee7 3575 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3576 c2 = PL_fold[c1];
f65d3ee7 3577 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3578 c2 = PL_fold_locale[c1];
1aa99e6b 3579 }
5f80c4cf 3580 else { /* UTF */
f65d3ee7 3581 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3582 STRLEN ulen1, ulen2;
e7ae6809
JH
3583 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3584 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3585
3586 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3587 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3588
872c91ae
JH
3589 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3590 ckWARN(WARN_UTF8) ?
3591 0 : UTF8_ALLOW_ANY);
3592 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3593 ckWARN(WARN_UTF8) ?
3594 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3595 }
3596 else {
872c91ae
JH
3597 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3598 ckWARN(WARN_UTF8) ?
3599 0 : UTF8_ALLOW_ANY);
5f80c4cf 3600 }
1aa99e6b
IH
3601 }
3602 }
bbce6d69 3603 }
a0d0e21e 3604 else
bbce6d69 3605 c1 = c2 = -1000;
cca55fe3 3606 assume_ok_easy:
3280af22 3607 PL_reginput = locinput;
a0d0e21e 3608 if (minmod) {
c277df42 3609 CHECKPOINT lastcp;
a0d0e21e
LW
3610 minmod = 0;
3611 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3612 sayNO;
a0ed51b3 3613 locinput = PL_reginput;
02db2b7b 3614 REGCP_SET(lastcp);
0fe9bf95 3615 if (c1 != -1000) {
1aa99e6b 3616 char *e; /* Should not check after this */
0fe9bf95 3617 char *old = locinput;
b2f2f093 3618 int count = 0;
0fe9bf95 3619
1aa99e6b 3620 if (n == REG_INFTY) {
0fe9bf95 3621 e = PL_regeol - 1;
1aa99e6b
IH
3622 if (do_utf8)
3623 while (UTF8_IS_CONTINUATION(*(U8*)e))
3624 e--;
3625 }
3626 else if (do_utf8) {
3627 int m = n - ln;
3628 for (e = locinput;
3629 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3630 e += UTF8SKIP(e);
3631 }
3632 else {
3633 e = locinput + n - ln;
3634 if (e >= PL_regeol)
3635 e = PL_regeol - 1;
3636 }
0fe9bf95
IZ
3637 while (1) {
3638 /* Find place 'next' could work */
1aa99e6b
IH
3639 if (!do_utf8) {
3640 if (c1 == c2) {
a8e8ab15
JH
3641 while (locinput <= e &&
3642 UCHARAT(locinput) != c1)
1aa99e6b
IH
3643 locinput++;
3644 } else {
9041c2e3 3645 while (locinput <= e
a8e8ab15
JH
3646 && UCHARAT(locinput) != c1
3647 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3648 locinput++;
3649 }
3650 count = locinput - old;
3651 }
3652 else {
3653 STRLEN len;
3654 if (c1 == c2) {
872c91ae
JH
3655 /* count initialised to
3656 * utf8_distance(old, locinput) */
b2f2f093 3657 while (locinput <= e &&
872c91ae
JH
3658 utf8n_to_uvchr((U8*)locinput,
3659 UTF8_MAXLEN, &len,
3660 ckWARN(WARN_UTF8) ?
eb160463 3661 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3662 locinput += len;
b2f2f093
JH
3663 count++;
3664 }
1aa99e6b 3665 } else {
872c91ae
JH
3666 /* count initialised to
3667 * utf8_distance(old, locinput) */
b2f2f093 3668 while (locinput <= e) {
872c91ae
JH
3669 UV c = utf8n_to_uvchr((U8*)locinput,
3670 UTF8_MAXLEN, &len,
3671 ckWARN(WARN_UTF8) ?
3672 0 : UTF8_ALLOW_ANY);
eb160463 3673 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3674 break;
b2f2f093
JH
3675 locinput += len;
3676 count++;
1aa99e6b
IH
3677 }
3678 }
0fe9bf95 3679 }
9041c2e3 3680 if (locinput > e)
0fe9bf95
IZ
3681 sayNO;
3682 /* PL_reginput == old now */
3683 if (locinput != old) {
3684 ln = 1; /* Did some */
1aa99e6b 3685 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3686 sayNO;
3687 }
3688 /* PL_reginput == locinput now */
29d1e993 3689 TRYPAREN(paren, ln, locinput);
0fe9bf95 3690 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3691 REGCP_UNWIND(lastcp);
0fe9bf95 3692 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3693 old = locinput;
3694 if (do_utf8)
3695 locinput += UTF8SKIP(locinput);
3696 else
3697 locinput++;
b2f2f093 3698 count = 1;
0fe9bf95
IZ
3699 }
3700 }
3701 else
c277df42 3702 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3703 UV c;
3704 if (c1 != -1000) {
3705 if (do_utf8)
872c91ae
JH
3706 c = utf8n_to_uvchr((U8*)PL_reginput,
3707 UTF8_MAXLEN, 0,
3708 ckWARN(WARN_UTF8) ?
3709 0 : UTF8_ALLOW_ANY);
1aa99e6b 3710 else
9041c2e3 3711 c = UCHARAT(PL_reginput);
2390ecbc 3712 /* If it could work, try it. */
eb160463 3713 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3714 {
ecc99935 3715 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3716 REGCP_UNWIND(lastcp);
3717 }
1aa99e6b 3718 }
a0d0e21e 3719 /* If it could work, try it. */
2390ecbc 3720 else if (c1 == -1000)
bbce6d69 3721 {
ecc99935 3722 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3723 REGCP_UNWIND(lastcp);
bbce6d69 3724 }
c277df42 3725 /* Couldn't or didn't -- move forward. */
a0ed51b3 3726 PL_reginput = locinput;
a0d0e21e
LW
3727 if (regrepeat(scan, 1)) {
3728 ln++;
a0ed51b3
LW
3729 locinput = PL_reginput;
3730 }
3731 else
4633a7c4 3732 sayNO;
a0d0e21e
LW
3733 }
3734 }
3735 else {
c277df42 3736 CHECKPOINT lastcp;
a0d0e21e 3737 n = regrepeat(scan, n);
a0ed51b3 3738 locinput = PL_reginput;
22c35a8c 3739 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
7fba1cd6 3740 (OP(next) != MEOL ||
15272685
HS
3741 OP(next) == SEOL || OP(next) == EOS))
3742 {
a0d0e21e 3743 ln = n; /* why back off? */
1aeab75a
GS
3744 /* ...because $ and \Z can match before *and* after
3745 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3746 We should back off by one in this case. */
3747 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3748 ln--;
3749 }
02db2b7b 3750 REGCP_SET(lastcp);
c277df42 3751 if (paren) {
8fa7f367 3752 UV c = 0;
c277df42 3753 while (n >= ln) {
1aa99e6b
IH
3754 if (c1 != -1000) {
3755 if (do_utf8)
872c91ae
JH
3756 c = utf8n_to_uvchr((U8*)PL_reginput,
3757 UTF8_MAXLEN, 0,
3758 ckWARN(WARN_UTF8) ?
3759 0 : UTF8_ALLOW_ANY);
1aa99e6b 3760 else
9041c2e3 3761 c = UCHARAT(PL_reginput);
1aa99e6b 3762 }
c277df42 3763 /* If it could work, try it. */
eb160463 3764 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3765 {
29d1e993 3766 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3767 REGCP_UNWIND(lastcp);
c277df42
IZ
3768 }
3769 /* Couldn't or didn't -- back up. */
3770 n--;
dfe13c55 3771 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3772 }
a0ed51b3
LW
3773 }
3774 else {
8fa7f367 3775 UV c = 0;
c277df42 3776 while (n >= ln) {
1aa99e6b
IH
3777 if (c1 != -1000) {
3778 if (do_utf8)
872c91ae
JH
3779 c = utf8n_to_uvchr((U8*)PL_reginput,
3780 UTF8_MAXLEN, 0,
3781 ckWARN(WARN_UTF8) ?
3782 0 : UTF8_ALLOW_ANY);
1aa99e6b 3783 else
9041c2e3 3784 c = UCHARAT(PL_reginput);
1aa99e6b 3785 }
c277df42 3786 /* If it could work, try it. */
eb160463 3787 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3788 {
29d1e993 3789 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3790 REGCP_UNWIND(lastcp);
c277df42
IZ
3791 }
3792 /* Couldn't or didn't -- back up. */
3793 n--;
dfe13c55 3794 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3795 }
a0d0e21e
LW
3796 }
3797 }
4633a7c4 3798 sayNO;
c277df42 3799 break;
a0d0e21e 3800 case END:
0f5d15d6
IZ
3801 if (PL_reg_call_cc) {
3802 re_cc_state *cur_call_cc = PL_reg_call_cc;
3803 CURCUR *cctmp = PL_regcc;
3804 regexp *re = PL_reg_re;
3805 CHECKPOINT cp, lastcp;
3806
3807 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3808 REGCP_SET(lastcp);
0f5d15d6
IZ
3809 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3810 the caller. */
3811 PL_reginput = locinput; /* Make position available to
3812 the callcc. */
3813 cache_re(PL_reg_call_cc->re);
3814 PL_regcc = PL_reg_call_cc->cc;
3815 PL_reg_call_cc = PL_reg_call_cc->prev;
3816 if (regmatch(cur_call_cc->node)) {
3817 PL_reg_call_cc = cur_call_cc;
3818 regcpblow(cp);
3819 sayYES;
3820 }
02db2b7b 3821 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3822 regcppop();
3823 PL_reg_call_cc = cur_call_cc;
3824 PL_regcc = cctmp;
3825 PL_reg_re = re;
3826 cache_re(re);
3827
3828 DEBUG_r(
3829 PerlIO_printf(Perl_debug_log,
3830 "%*s continuation failed...\n",
3831 REPORT_CODE_OFF+PL_regindent*2, "")
3832 );
7821416a 3833 sayNO_SILENT;
0f5d15d6 3834 }
7821416a
IZ
3835 if (locinput < PL_regtill) {
3836 DEBUG_r(PerlIO_printf(Perl_debug_log,
3837 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3838 PL_colors[4],
3839 (long)(locinput - PL_reg_starttry),
3840 (long)(PL_regtill - PL_reg_starttry),
3841 PL_colors[5]));
3842 sayNO_FINAL; /* Cannot match: too short. */
3843 }
3844 PL_reginput = locinput; /* put where regtry can find it */
3845 sayYES_FINAL; /* Success! */
7e5428c5 3846 case SUCCEED:
3280af22 3847 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3848 sayYES_LOUD; /* Success! */
c277df42
IZ
3849 case SUSPEND:
3850 n = 1;
9fe1d20c 3851 PL_reginput = locinput;
9041c2e3 3852 goto do_ifmatch;
a0d0e21e 3853 case UNLESSM:
c277df42 3854 n = 0;
a0ed51b3 3855 if (scan->flags) {
efb30f32
HS
3856 s = HOPBACKc(locinput, scan->flags);
3857 if (!s)
3858 goto say_yes;
3859 PL_reginput = s;
a0ed51b3
LW
3860 }
3861 else
3862 PL_reginput = locinput;
c2b0868c 3863 PL_reg_flags ^= RF_false;
c277df42
IZ
3864 goto do_ifmatch;
3865 case IFMATCH:
3866 n = 1;
a0ed51b3 3867 if (scan->flags) {
efb30f32
HS
3868 s = HOPBACKc(locinput, scan->flags);
3869 if (!s)
3870 goto say_no;
3871 PL_reginput = s;
a0ed51b3
LW
3872 }
3873 else
3874 PL_reginput = locinput;
3875
c277df42 3876 do_ifmatch:
c277df42
IZ
3877 inner = NEXTOPER(NEXTOPER(scan));
3878 if (regmatch(inner) != n) {
c2b0868c
HS
3879 if (n == 0)
3880 PL_reg_flags ^= RF_false;
c277df42
IZ
3881 say_no:
3882 if (logical) {
3883 logical = 0;
3884 sw = 0;
3885 goto do_longjump;
a0ed51b3
LW
3886 }
3887 else
c277df42
IZ
3888 sayNO;
3889 }
c2b0868c
HS
3890 if (n == 0)
3891 PL_reg_flags ^= RF_false;
c277df42
IZ
3892 say_yes:
3893 if (logical) {
3894 logical = 0;
3895 sw = 1;
3896 }
fe44a5e8 3897 if (OP(scan) == SUSPEND) {
3280af22 3898 locinput = PL_reginput;
565764a8 3899 nextchr = UCHARAT(locinput);
fe44a5e8 3900 }
c277df42
IZ
3901 /* FALL THROUGH. */
3902 case LONGJMP:
3903 do_longjump:
3904 next = scan + ARG(scan);
3905 if (next == scan)
3906 next = NULL;
a0d0e21e
LW
3907 break;
3908 default:
b900a521 3909 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3910 PTR2UV(scan), OP(scan));
cea2e8a9 3911 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3912 }
02db2b7b 3913 reenter:
a0d0e21e
LW
3914 scan = next;
3915 }
a687059c 3916
a0d0e21e
LW
3917 /*
3918 * We get here only if there's trouble -- normally "case END" is
3919 * the terminating point.
3920 */
cea2e8a9 3921 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3922 /*NOTREACHED*/
4633a7c4
LW
3923 sayNO;
3924
7821416a
IZ
3925yes_loud:
3926 DEBUG_r(
3927 PerlIO_printf(Perl_debug_log,
3928 "%*s %scould match...%s\n",
3929 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3930 );
3931 goto yes;
3932yes_final:
3933 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3934 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3935yes:
3936#ifdef DEBUGGING
3280af22 3937 PL_regindent--;
4633a7c4 3938#endif
02db2b7b
IZ
3939
3940#if 0 /* Breaks $^R */
3941 if (unwind)
3942 regcpblow(firstcp);
3943#endif
4633a7c4
LW
3944 return 1;
3945
3946no:
7821416a
IZ
3947 DEBUG_r(
3948 PerlIO_printf(Perl_debug_log,
3949 "%*s %sfailed...%s\n",
3950 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3951 );
3952 goto do_no;
3953no_final:
3954do_no:
02db2b7b
IZ
3955 if (unwind) {
3956 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3957
3958 switch (uw->type) {
3959 case RE_UNWIND_BRANCH:
3960 case RE_UNWIND_BRANCHJ:
3961 {
3962 re_unwind_branch_t *uwb = &(uw->branch);
3963 I32 lastparen = uwb->lastparen;
9041c2e3 3964
02db2b7b
IZ
3965 REGCP_UNWIND(uwb->lastcp);
3966 for (n = *PL_reglastparen; n > lastparen; n--)
3967 PL_regendp[n] = -1;
3968 *PL_reglastparen = n;
3969 scan = next = uwb->next;
9041c2e3
NIS
3970 if ( !scan ||
3971 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3972 ? BRANCH : BRANCHJ) ) { /* Failure */
3973 unwind = uwb->prev;
3974#ifdef DEBUGGING
3975 PL_regindent--;
3976#endif
3977 goto do_no;
3978 }
3979 /* Have more choice yet. Reuse the same uwb. */
3980 /*SUPPRESS 560*/
3981 if ((n = (uwb->type == RE_UNWIND_BRANCH
3982 ? NEXT_OFF(next) : ARG(next))))
3983 next += n;
3984 else
3985 next = NULL; /* XXXX Needn't unwinding in this case... */
3986 uwb->next = next;
3987 next = NEXTOPER(scan);
3988 if (uwb->type == RE_UNWIND_BRANCHJ)
3989 next = NEXTOPER(next);
3990 locinput = uwb->locinput;
3991 nextchr = uwb->nextchr;
3992#ifdef DEBUGGING
3993 PL_regindent = uwb->regindent;
3994#endif
3995
3996 goto reenter;
3997 }
3998 /* NOT REACHED */
3999 default:
4000 Perl_croak(aTHX_ "regexp unwind memory corruption");
4001 }
4002 /* NOT REACHED */
4003 }
4633a7c4 4004#ifdef DEBUGGING
3280af22 4005 PL_regindent--;
4633a7c4 4006#endif
a0d0e21e 4007 return 0;
a687059c
LW
4008}
4009
4010/*
4011 - regrepeat - repeatedly match something simple, report how many
4012 */
4013/*
4014 * [This routine now assumes that it will only match on things of length 1.
4015 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4016 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4017 */
76e3520e 4018STATIC I32
cea2e8a9 4019S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 4020{
a0d0e21e 4021 register char *scan;
a0d0e21e 4022 register I32 c;
3280af22 4023 register char *loceol = PL_regeol;
a0ed51b3 4024 register I32 hardcount = 0;
53c4c00c 4025 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4026
3280af22 4027 scan = PL_reginput;
faf11cac
HS
4028 if (max == REG_INFTY)
4029 max = I32_MAX;
4030 else if (max < loceol - scan)
a0d0e21e 4031 loceol = scan + max;
a0d0e21e 4032 switch (OP(p)) {
22c35a8c 4033 case REG_ANY:
1aa99e6b 4034 if (do_utf8) {
ffc61ed2 4035 loceol = PL_regeol;
1aa99e6b 4036 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4037 scan += UTF8SKIP(scan);
4038 hardcount++;
4039 }
4040 } else {
4041 while (scan < loceol && *scan != '\n')
4042 scan++;
a0ed51b3
LW
4043 }
4044 break;
ffc61ed2 4045 case SANY:
def8e4ea
JH
4046 if (do_utf8) {
4047 loceol = PL_regeol;
a0804c9e 4048 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4049 scan += UTF8SKIP(scan);
4050 hardcount++;
4051 }
4052 }
4053 else
4054 scan = loceol;
a0ed51b3 4055 break;
f33976b4
DB
4056 case CANY:
4057 scan = loceol;
4058 break;
090f7165
JH
4059 case EXACT: /* length of string is 1 */
4060 c = (U8)*STRING(p);
4061 while (scan < loceol && UCHARAT(scan) == c)
4062 scan++;
bbce6d69 4063 break;
4064 case EXACTF: /* length of string is 1 */
cd439c50 4065 c = (U8)*STRING(p);
bbce6d69 4066 while (scan < loceol &&
22c35a8c 4067 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4068 scan++;
4069 break;
4070 case EXACTFL: /* length of string is 1 */
3280af22 4071 PL_reg_flags |= RF_tainted;
cd439c50 4072 c = (U8)*STRING(p);
bbce6d69 4073 while (scan < loceol &&
22c35a8c 4074 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4075 scan++;
4076 break;
4077 case ANYOF:
ffc61ed2
JH
4078 if (do_utf8) {
4079 loceol = PL_regeol;
cfc92286
JH
4080 while (hardcount < max && scan < loceol &&
4081 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4082 scan += UTF8SKIP(scan);
4083 hardcount++;
4084 }
4085 } else {
7d3e948e 4086 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4087 scan++;
4088 }
a0d0e21e
LW
4089 break;
4090 case ALNUM:
1aa99e6b 4091 if (do_utf8) {
ffc61ed2 4092 loceol = PL_regeol;
8269fa76 4093 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4094 while (hardcount < max && scan < loceol &&
3568d838 4095 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4096 scan += UTF8SKIP(scan);
4097 hardcount++;
4098 }
4099 } else {
4100 while (scan < loceol && isALNUM(*scan))
4101 scan++;
a0ed51b3
LW
4102 }
4103 break;
bbce6d69 4104 case ALNUML:
3280af22 4105 PL_reg_flags |= RF_tainted;
1aa99e6b 4106 if (do_utf8) {
ffc61ed2 4107 loceol = PL_regeol;
1aa99e6b
IH
4108 while (hardcount < max && scan < loceol &&
4109 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4110 scan += UTF8SKIP(scan);
4111 hardcount++;
4112 }
4113 } else {
4114 while (scan < loceol && isALNUM_LC(*scan))
4115 scan++;
a0ed51b3
LW
4116 }
4117 break;
a0d0e21e 4118 case NALNUM:
1aa99e6b 4119 if (do_utf8) {
ffc61ed2 4120 loceol = PL_regeol;
8269fa76 4121 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4122 while (hardcount < max && scan < loceol &&
3568d838 4123 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4124 scan += UTF8SKIP(scan);
4125 hardcount++;
4126 }
4127 } else {
4128 while (scan < loceol && !isALNUM(*scan))
4129 scan++;
a0ed51b3
LW
4130 }
4131 break;
bbce6d69 4132 case NALNUML:
3280af22 4133 PL_reg_flags |= RF_tainted;
1aa99e6b 4134 if (do_utf8) {
ffc61ed2 4135 loceol = PL_regeol;
1aa99e6b
IH
4136 while (hardcount < max && scan < loceol &&
4137 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4138 scan += UTF8SKIP(scan);
4139 hardcount++;
4140 }
4141 } else {
4142 while (scan < loceol && !isALNUM_LC(*scan))
4143 scan++;
a0ed51b3
LW
4144 }
4145 break;
a0d0e21e 4146 case SPACE:
1aa99e6b 4147 if (do_utf8) {
ffc61ed2 4148 loceol = PL_regeol;
8269fa76 4149 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4150 while (hardcount < max && scan < loceol &&
3568d838
JH
4151 (*scan == ' ' ||
4152 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4153 scan += UTF8SKIP(scan);
4154 hardcount++;
4155 }
4156 } else {
4157 while (scan < loceol && isSPACE(*scan))
4158 scan++;
a0ed51b3
LW
4159 }
4160 break;
bbce6d69 4161 case SPACEL:
3280af22 4162 PL_reg_flags |= RF_tainted;
1aa99e6b 4163 if (do_utf8) {
ffc61ed2 4164 loceol = PL_regeol;
1aa99e6b 4165 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4166 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4167 scan += UTF8SKIP(scan);
4168 hardcount++;
4169 }
4170 } else {
4171 while (scan < loceol && isSPACE_LC(*scan))
4172 scan++;
a0ed51b3
LW
4173 }
4174 break;
a0d0e21e 4175 case NSPACE:
1aa99e6b 4176 if (do_utf8) {
ffc61ed2 4177 loceol = PL_regeol;
8269fa76 4178 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4179 while (hardcount < max && scan < loceol &&
3568d838
JH
4180 !(*scan == ' ' ||
4181 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4182 scan += UTF8SKIP(scan);
4183 hardcount++;
4184 }
4185 } else {
4186 while (scan < loceol && !isSPACE(*scan))
4187 scan++;
4188 break;
a0ed51b3 4189 }
bbce6d69 4190 case NSPACEL:
3280af22 4191 PL_reg_flags |= RF_tainted;
1aa99e6b 4192 if (do_utf8) {
ffc61ed2 4193 loceol = PL_regeol;
1aa99e6b 4194 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4195 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4196 scan += UTF8SKIP(scan);
4197 hardcount++;
4198 }
4199 } else {
4200 while (scan < loceol && !isSPACE_LC(*scan))
4201 scan++;
a0ed51b3
LW
4202 }
4203 break;
a0d0e21e 4204 case DIGIT:
1aa99e6b 4205 if (do_utf8) {
ffc61ed2 4206 loceol = PL_regeol;
8269fa76 4207 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4208 while (hardcount < max && scan < loceol &&
3568d838 4209 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4210 scan += UTF8SKIP(scan);
4211 hardcount++;
4212 }
4213 } else {
4214 while (scan < loceol && isDIGIT(*scan))
4215 scan++;
a0ed51b3
LW
4216 }
4217 break;
a0d0e21e 4218 case NDIGIT:
1aa99e6b 4219 if (do_utf8) {
ffc61ed2 4220 loceol = PL_regeol;
8269fa76 4221 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4222 while (hardcount < max && scan < loceol &&
3568d838 4223 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4224 scan += UTF8SKIP(scan);
4225 hardcount++;
4226 }
4227 } else {
4228 while (scan < loceol && !isDIGIT(*scan))
4229 scan++;
a0ed51b3
LW
4230 }
4231 break;
a0d0e21e
LW
4232 default: /* Called on something of 0 width. */
4233 break; /* So match right here or not at all. */
4234 }
a687059c 4235
a0ed51b3
LW
4236 if (hardcount)
4237 c = hardcount;
4238 else
4239 c = scan - PL_reginput;
3280af22 4240 PL_reginput = scan;
a687059c 4241
9041c2e3 4242 DEBUG_r(
c277df42
IZ
4243 {
4244 SV *prop = sv_newmortal();
4245
4246 regprop(prop, p);
9041c2e3
NIS
4247 PerlIO_printf(Perl_debug_log,
4248 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4249 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4250 });
9041c2e3 4251
a0d0e21e 4252 return(c);
a687059c
LW
4253}
4254
4255/*
c277df42 4256 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4257 *
0e788c72 4258 * The repeater is supposed to have constant non-zero length.
c277df42
IZ
4259 */
4260
76e3520e 4261STATIC I32
cea2e8a9 4262S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4263{
b7953727 4264 register char *scan = Nullch;
c277df42 4265 register char *start;
3280af22 4266 register char *loceol = PL_regeol;
a0ed51b3 4267 I32 l = 0;
708e3b05 4268 I32 count = 0, res = 1;
a0ed51b3
LW
4269
4270 if (!max)
4271 return 0;
c277df42 4272
3280af22 4273 start = PL_reginput;
53c4c00c 4274 if (PL_reg_match_utf8) {
708e3b05 4275 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4276 if (!count++) {
4277 l = 0;
4278 while (start < PL_reginput) {
4279 l++;
4280 start += UTF8SKIP(start);
4281 }
4282 *lp = l;
4283 if (l == 0)
4284 return max;
4285 }
4286 if (count == max)
4287 return count;
4288 }
4289 }
4290 else {
708e3b05 4291 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4292 if (!count++) {
4293 *lp = l = PL_reginput - start;
4294 if (max != REG_INFTY && l*max < loceol - scan)
4295 loceol = scan + l*max;
4296 if (l == 0)
4297 return max;
c277df42
IZ
4298 }
4299 }
4300 }
708e3b05 4301 if (!res)
3280af22 4302 PL_reginput = scan;
9041c2e3 4303
a0ed51b3 4304 return count;
c277df42
IZ
4305}
4306
4307/*
ffc61ed2
JH
4308- regclass_swash - prepare the utf8 swash
4309*/
4310
4311SV *
9e55ce06 4312Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4313{
9e55ce06
JH
4314 SV *sw = NULL;
4315 SV *si = NULL;
4316 SV *alt = NULL;
ffc61ed2
JH
4317
4318 if (PL_regdata && PL_regdata->count) {
4319 U32 n = ARG(node);
4320
4321 if (PL_regdata->what[n] == 's') {
4322 SV *rv = (SV*)PL_regdata->data[n];
4323 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4324 SV **ary = AvARRAY(av);
9e55ce06 4325 SV **a, **b;
9041c2e3 4326
9e55ce06
JH
4327 /* See the end of regcomp.c:S_reglass() for
4328 * documentation of these array elements. */
4329
b11f357e
JH
4330 si = *ary;
4331 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4332 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4333
ffc61ed2
JH
4334 if (a)
4335 sw = *a;
4336 else if (si && doinit) {
4337 sw = swash_init("utf8", "", si, 1, 0);
4338 (void)av_store(av, 1, sw);
4339 }
9e55ce06
JH
4340 if (b)
4341 alt = *b;
ffc61ed2
JH
4342 }
4343 }
4344
9e55ce06
JH
4345 if (listsvp)
4346 *listsvp = si;
4347 if (altsvp)
4348 *altsvp = alt;
ffc61ed2
JH
4349
4350 return sw;
4351}
4352
4353/*
ba7b4546 4354 - reginclass - determine if a character falls into a character class
832705d4
JH
4355
4356 The n is the ANYOF regnode, the p is the target string, lenp
4357 is pointer to the maximum length of how far to go in the p
4358 (if the lenp is zero, UTF8SKIP(p) is used),
4359 do_utf8 tells whether the target string is in UTF-8.
4360
bbce6d69 4361 */
4362
76e3520e 4363STATIC bool
ba7b4546 4364S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4365{
ffc61ed2 4366 char flags = ANYOF_FLAGS(n);
bbce6d69 4367 bool match = FALSE;
cc07378b 4368 UV c = *p;
ae9ddab8 4369 STRLEN len = 0;
9e55ce06 4370 STRLEN plen;
1aa99e6b 4371
ae9ddab8
JH
4372 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4373 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4374 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4375
0f0076b4 4376 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4377 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4378 if (lenp)
4379 *lenp = 0;
ffc61ed2 4380 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4381 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4382 match = TRUE;
bbce6d69 4383 }
3568d838 4384 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4385 match = TRUE;
ffc61ed2 4386 if (!match) {
9e55ce06
JH
4387 AV *av;
4388 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4389
4390 if (sw) {
3568d838 4391 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4392 match = TRUE;
4393 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4394 if (!match && lenp && av) {
4395 I32 i;
4396
4397 for (i = 0; i <= av_len(av); i++) {
4398 SV* sv = *av_fetch(av, i, FALSE);
4399 STRLEN len;
4400 char *s = SvPV(sv, len);
4401
061b10df 4402 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4403 *lenp = len;
4404 match = TRUE;
4405 break;
4406 }
4407 }
4408 }
4409 if (!match) {
4a623e43
JH
4410 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4411 STRLEN tmplen;
4412
9e55ce06
JH
4413 to_utf8_fold(p, tmpbuf, &tmplen);
4414 if (swash_fetch(sw, tmpbuf, do_utf8))
4415 match = TRUE;
4416 }
ffc61ed2
JH
4417 }
4418 }
bbce6d69 4419 }
9e55ce06 4420 if (match && lenp && *lenp == 0)
0f0076b4 4421 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4422 }
1aa99e6b 4423 if (!match && c < 256) {
ffc61ed2
JH
4424 if (ANYOF_BITMAP_TEST(n, c))
4425 match = TRUE;
4426 else if (flags & ANYOF_FOLD) {
eb160463 4427 U8 f;
a0ed51b3 4428
ffc61ed2
JH
4429 if (flags & ANYOF_LOCALE) {
4430 PL_reg_flags |= RF_tainted;
4431 f = PL_fold_locale[c];
4432 }
4433 else
4434 f = PL_fold[c];
4435 if (f != c && ANYOF_BITMAP_TEST(n, f))
4436 match = TRUE;
4437 }
4438
4439 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4440 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4441 if (
4442 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4443 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4444 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4445 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4446 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4447 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4448 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4449 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4450 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4451 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4452 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4453 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4454 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4455 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4456 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4457 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4458 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4459 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4460 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4461 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4462 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4463 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4472 ) /* How's that for a conditional? */
4473 {
4474 match = TRUE;
4475 }
a0ed51b3 4476 }
a0ed51b3
LW
4477 }
4478
a0ed51b3
LW
4479 return (flags & ANYOF_INVERT) ? !match : match;
4480}
161b471a 4481
dfe13c55 4482STATIC U8 *
cea2e8a9 4483S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4484{
1aa99e6b
IH
4485 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4486}
4487
4488STATIC U8 *
4489S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4490{
a0ed51b3 4491 if (off >= 0) {
1aa99e6b 4492 while (off-- && s < lim) {
ffc61ed2 4493 /* XXX could check well-formedness here */
a0ed51b3 4494 s += UTF8SKIP(s);
ffc61ed2 4495 }
a0ed51b3
LW
4496 }
4497 else {
4498 while (off++) {
1aa99e6b 4499 if (s > lim) {
a0ed51b3 4500 s--;
ffc61ed2 4501 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4502 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4503 s--;
ffc61ed2
JH
4504 }
4505 /* XXX could check well-formedness here */
a0ed51b3
LW
4506 }
4507 }
4508 }
4509 return s;
4510}
161b471a 4511
dfe13c55 4512STATIC U8 *
1aa99e6b 4513S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4514{
1aa99e6b
IH
4515 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4516}
4517
4518STATIC U8 *
4519S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4520{
4521 if (off >= 0) {
1aa99e6b 4522 while (off-- && s < lim) {
ffc61ed2 4523 /* XXX could check well-formedness here */
a0ed51b3 4524 s += UTF8SKIP(s);
ffc61ed2 4525 }
a0ed51b3
LW
4526 if (off >= 0)
4527 return 0;
4528 }
4529 else {
4530 while (off++) {
1aa99e6b 4531 if (s > lim) {
a0ed51b3 4532 s--;
ffc61ed2 4533 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4534 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4535 s--;
ffc61ed2
JH
4536 }
4537 /* XXX could check well-formedness here */
a0ed51b3
LW
4538 }
4539 else
4540 break;
4541 }
4542 if (off <= 0)
4543 return 0;
4544 }
4545 return s;
4546}
51371543 4547
51371543 4548static void
acfe0abc 4549restore_pos(pTHX_ void *arg)
51371543 4550{
51371543
GS
4551 if (PL_reg_eval_set) {
4552 if (PL_reg_oldsaved) {
4553 PL_reg_re->subbeg = PL_reg_oldsaved;
4554 PL_reg_re->sublen = PL_reg_oldsavedlen;
ed252734
NC
4555#ifdef PERL_COPY_ON_WRITE
4556 PL_reg_re->saved_copy = PL_nrs;
4557#endif
51371543
GS
4558 RX_MATCH_COPIED_on(PL_reg_re);
4559 }
4560 PL_reg_magic->mg_len = PL_reg_oldpos;
4561 PL_reg_eval_set = 0;
4562 PL_curpm = PL_reg_oldcurpm;
4563 }
4564}
33b8afdf
JH
4565
4566STATIC void
4567S_to_utf8_substr(pTHX_ register regexp *prog)
4568{
4569 SV* sv;
4570 if (prog->float_substr && !prog->float_utf8) {
4571 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4572 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4573 sv_utf8_upgrade(sv);
4574 if (SvTAIL(prog->float_substr))
4575 SvTAIL_on(sv);
4576 if (prog->float_substr == prog->check_substr)
4577 prog->check_utf8 = sv;
4578 }
4579 if (prog->anchored_substr && !prog->anchored_utf8) {
4580 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4581 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4582 sv_utf8_upgrade(sv);
4583 if (SvTAIL(prog->anchored_substr))
4584 SvTAIL_on(sv);
4585 if (prog->anchored_substr == prog->check_substr)
4586 prog->check_utf8 = sv;
4587 }
4588}
4589
4590STATIC void
4591S_to_byte_substr(pTHX_ register regexp *prog)
4592{
4593 SV* sv;
4594 if (prog->float_utf8 && !prog->float_substr) {
4595 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4596 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4597 if (sv_utf8_downgrade(sv, TRUE)) {
4598 if (SvTAIL(prog->float_utf8))
4599 SvTAIL_on(sv);
4600 } else {
4601 SvREFCNT_dec(sv);
4602 prog->float_substr = sv = &PL_sv_undef;
4603 }
4604 if (prog->float_utf8 == prog->check_utf8)
4605 prog->check_substr = sv;
4606 }
4607 if (prog->anchored_utf8 && !prog->anchored_substr) {
4608 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4609 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4610 if (sv_utf8_downgrade(sv, TRUE)) {
4611 if (SvTAIL(prog->anchored_utf8))
4612 SvTAIL_on(sv);
4613 } else {
4614 SvREFCNT_dec(sv);
4615 prog->anchored_substr = sv = &PL_sv_undef;
4616 }
4617 if (prog->anchored_utf8 == prog->check_utf8)
4618 prog->check_substr = sv;
4619 }
4620}