This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix "[perl #34643] Config_heavy.pl not in archlib"
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
f2b990bf 82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
a687059c
LW
86 *
87 * Beware that some of this code is subtly aware of the way operator
88 * precedence is structured in regular expressions. Serious changes in
89 * regular-expression syntax might require a total rethink.
90 */
91#include "EXTERN.h"
864dbfa3 92#define PERL_IN_REGEXEC_C
a687059c 93#include "perl.h"
0f5d15d6 94
a687059c
LW
95#include "regcomp.h"
96
c277df42
IZ
97#define RF_tainted 1 /* tainted information used? */
98#define RF_warned 2 /* warned about big count? */
ce862d02 99#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
100#define RF_utf8 8 /* String contains multibyte chars? */
101
eb160463 102#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
103
104#define RS_init 1 /* eval environment created */
105#define RS_set 2 /* replsv value is set */
c277df42 106
a687059c
LW
107#ifndef STATIC
108#define STATIC static
109#endif
110
ba7b4546 111#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 112
c277df42
IZ
113/*
114 * Forwards.
115 */
116
33b8afdf 117#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 118#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 119
dfe13c55
GS
120#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
121#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
122#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
123#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
124#define HOPc(pos,off) ((char*)HOP(pos,off))
125#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 126
efb30f32 127#define HOPBACK(pos, off) ( \
e54858b0 128 (PL_reg_match_utf8) \
efb30f32
HS
129 ? reghopmaybe((U8*)pos, -off) \
130 : (pos - off >= PL_bostr) \
131 ? (U8*)(pos - off) \
132 : (U8*)NULL \
133)
134#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
135
1aa99e6b
IH
136#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
137#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
138#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
139#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
140#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
141#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
142
bfed75c6 143#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((const U8*)b); LEAVE; } } STMT_END
51371543 144
5f80c4cf 145/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
146#define JUMPABLE(rn) ( \
147 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
148 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
149 OP(rn) == PLUS || OP(rn) == MINMOD || \
150 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
151)
152
cca55fe3
JP
153#define HAS_TEXT(rn) ( \
154 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
155)
e2d8ce26 156
a84d97b6
HS
157/*
158 Search for mandatory following text node; for lookahead, the text must
159 follow but for lookbehind (rn->flags != 0) we skip to the next step.
160*/
cca55fe3 161#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 162 while (JUMPABLE(rn)) \
a84d97b6 163 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 164 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
165 else if (OP(rn) == PLUS) \
166 rn = NEXTOPER(rn); \
a84d97b6
HS
167 else if (OP(rn) == IFMATCH) \
168 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 169 else rn += NEXT_OFF(rn); \
5f80c4cf 170} STMT_END
74750237 171
acfe0abc 172static void restore_pos(pTHX_ void *arg);
51371543 173
76e3520e 174STATIC CHECKPOINT
cea2e8a9 175S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 176{
3280af22 177 int retval = PL_savestack_ix;
b1ce53c5
JH
178#define REGCP_PAREN_ELEMS 4
179 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
180 int p;
181
e49a9654
IH
182 if (paren_elems_to_push < 0)
183 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
184
a01268b5 185#define REGCP_OTHER_ELEMS 6
4b3c1a47 186 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 187 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 188/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
189 SSPUSHINT(PL_regendp[p]);
190 SSPUSHINT(PL_regstartp[p]);
3280af22 191 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
192 SSPUSHINT(p);
193 }
b1ce53c5 194/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
195 SSPUSHINT(PL_regsize);
196 SSPUSHINT(*PL_reglastparen);
a01268b5 197 SSPUSHINT(*PL_reglastcloseparen);
3280af22 198 SSPUSHPTR(PL_reginput);
41123dfd
JH
199#define REGCP_FRAME_ELEMS 2
200/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
201 * are needed for the regexp context stack bookkeeping. */
202 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 203 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 204
a0d0e21e
LW
205 return retval;
206}
207
c277df42 208/* These are needed since we do not localize EVAL nodes: */
a3621e74 209# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 210 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 211 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 212
a3621e74 213# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 214 PerlIO_printf(Perl_debug_log, \
faccc32b 215 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 216 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 217
76e3520e 218STATIC char *
cea2e8a9 219S_regcppop(pTHX)
a0d0e21e 220{
b1ce53c5 221 I32 i;
a0d0e21e
LW
222 U32 paren = 0;
223 char *input;
cf93c79d 224 I32 tmps;
b1ce53c5 225
a3621e74
YO
226 GET_RE_DEBUG_FLAGS_DECL;
227
b1ce53c5 228 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 229 i = SSPOPINT;
b1ce53c5
JH
230 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
231 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 232 input = (char *) SSPOPPTR;
a01268b5 233 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
234 *PL_reglastparen = SSPOPINT;
235 PL_regsize = SSPOPINT;
b1ce53c5
JH
236
237 /* Now restore the parentheses context. */
41123dfd
JH
238 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
239 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 240 paren = (U32)SSPOPINT;
3280af22 241 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
242 PL_regstartp[paren] = SSPOPINT;
243 tmps = SSPOPINT;
3280af22
NIS
244 if (paren <= *PL_reglastparen)
245 PL_regendp[paren] = tmps;
a3621e74 246 DEBUG_EXECUTE_r(
c3464db5 247 PerlIO_printf(Perl_debug_log,
b900a521 248 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 249 (UV)paren, (IV)PL_regstartp[paren],
b900a521 250 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 251 (IV)PL_regendp[paren],
3280af22 252 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 253 );
a0d0e21e 254 }
a3621e74 255 DEBUG_EXECUTE_r(
eb160463 256 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 257 PerlIO_printf(Perl_debug_log,
faccc32b
JH
258 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
259 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
260 }
261 );
daf18116 262#if 1
dafc8851
JH
263 /* It would seem that the similar code in regtry()
264 * already takes care of this, and in fact it is in
265 * a better location to since this code can #if 0-ed out
266 * but the code in regtry() is needed or otherwise tests
267 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
268 * (as of patchlevel 7877) will fail. Then again,
269 * this code seems to be necessary or otherwise
270 * building DynaLoader will fail:
271 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
272 * --jhi */
eb160463
GS
273 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
274 if ((I32)paren > PL_regsize)
cf93c79d
IZ
275 PL_regstartp[paren] = -1;
276 PL_regendp[paren] = -1;
a0d0e21e 277 }
dafc8851 278#endif
a0d0e21e
LW
279 return input;
280}
281
0f5d15d6 282STATIC char *
cea2e8a9 283S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
284{
285 I32 tmp = PL_savestack_ix;
286
287 PL_savestack_ix = ss;
288 regcppop();
289 PL_savestack_ix = tmp;
942e002e 290 return Nullch;
0f5d15d6
IZ
291}
292
293typedef struct re_cc_state
294{
295 I32 ss;
296 regnode *node;
297 struct re_cc_state *prev;
298 CURCUR *cc;
299 regexp *re;
300} re_cc_state;
301
02db2b7b 302#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 303
29d1e993
HS
304#define TRYPAREN(paren, n, input) { \
305 if (paren) { \
306 if (n) { \
307 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
308 PL_regendp[paren] = input - PL_bostr; \
309 } \
310 else \
311 PL_regendp[paren] = -1; \
312 } \
313 if (regmatch(next)) \
314 sayYES; \
315 if (paren && n) \
316 PL_regendp[paren] = -1; \
317}
318
319
a687059c 320/*
e50aee73 321 * pregexec and friends
a687059c
LW
322 */
323
324/*
c277df42 325 - pregexec - match a regexp against a string
a687059c 326 */
c277df42 327I32
864dbfa3 328Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 329 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
330/* strend: pointer to null at end of string */
331/* strbeg: real beginning of string */
332/* minend: end of match must be >=minend after stringarg. */
333/* nosave: For optimizations. */
334{
335 return
9041c2e3 336 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
337 nosave ? 0 : REXEC_COPY_STR);
338}
0f5d15d6
IZ
339
340STATIC void
cea2e8a9 341S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
342{
343 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
344#ifdef DEBUGGING
345 PL_regprogram = prog->program;
346#endif
347 PL_regnpar = prog->nparens;
9041c2e3
NIS
348 PL_regdata = prog->data;
349 PL_reg_re = prog;
0f5d15d6 350}
22e551b9 351
9041c2e3 352/*
cad2e5aa
JH
353 * Need to implement the following flags for reg_anch:
354 *
355 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
356 * USE_INTUIT_ML
357 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
358 * INTUIT_AUTORITATIVE_ML
359 * INTUIT_ONCE_NOML - Intuit can match in one location only.
360 * INTUIT_ONCE_ML
361 *
362 * Another flag for this function: SECOND_TIME (so that float substrs
363 * with giant delta may be not rechecked).
364 */
365
366/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
367
2c2d71f5 368/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
369 Otherwise, only SvCUR(sv) is used to get strbeg. */
370
371/* XXXX We assume that strpos is strbeg unless sv. */
372
6eb5f6b9
JH
373/* XXXX Some places assume that there is a fixed substring.
374 An update may be needed if optimizer marks as "INTUITable"
375 RExen without fixed substrings. Similarly, it is assumed that
376 lengths of all the strings are no more than minlen, thus they
377 cannot come from lookahead.
378 (Or minlen should take into account lookahead.) */
379
2c2d71f5
JH
380/* A failure to find a constant substring means that there is no need to make
381 an expensive call to REx engine, thus we celebrate a failure. Similarly,
382 finding a substring too deep into the string means that less calls to
30944b6d
IZ
383 regtry() should be needed.
384
385 REx compiler's optimizer found 4 possible hints:
386 a) Anchored substring;
387 b) Fixed substring;
388 c) Whether we are anchored (beginning-of-line or \G);
389 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 390 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
391 string which does not contradict any of them.
392 */
2c2d71f5 393
6eb5f6b9
JH
394/* Most of decisions we do here should have been done at compile time.
395 The nodes of the REx which we used for the search should have been
396 deleted from the finite automaton. */
397
cad2e5aa
JH
398char *
399Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
400 char *strend, U32 flags, re_scream_pos_data *data)
401{
b7953727 402 register I32 start_shift = 0;
cad2e5aa 403 /* Should be nonnegative! */
b7953727 404 register I32 end_shift = 0;
2c2d71f5
JH
405 register char *s;
406 register SV *check;
a1933d95 407 char *strbeg;
cad2e5aa 408 char *t;
33b8afdf 409 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 410 I32 ml_anch;
6eb5f6b9 411 register char *other_last = Nullch; /* other substr checked before this */
b7953727 412 char *check_at = Nullch; /* check substr found at this pos */
7fba1cd6 413 I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d
IZ
414#ifdef DEBUGGING
415 char *i_strpos = strpos;
ce333219 416 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 417#endif
a3621e74
YO
418
419 GET_RE_DEBUG_FLAGS_DECL;
420
a30b2f1f 421 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 422
b8d68ded 423 if (prog->reganch & ROPT_UTF8) {
a3621e74 424 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
425 "UTF-8 regex...\n"));
426 PL_reg_flags |= RF_utf8;
427 }
428
a3621e74 429 DEBUG_EXECUTE_r({
b8d68ded 430 char *s = PL_reg_match_utf8 ?
c728cb41
JH
431 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
432 strpos;
b8d68ded
JH
433 int len = PL_reg_match_utf8 ?
434 strlen(s) : strend - strpos;
2a782b5b
JH
435 if (!PL_colorset)
436 reginitcolors();
b8d68ded 437 if (PL_reg_match_utf8)
a3621e74 438 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 439 "UTF-8 target...\n"));
2a782b5b
JH
440 PerlIO_printf(Perl_debug_log,
441 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
442 PL_colors[4],PL_colors[5],PL_colors[0],
443 prog->precomp,
444 PL_colors[1],
445 (strlen(prog->precomp) > 60 ? "..." : ""),
446 PL_colors[0],
447 (int)(len > 60 ? 60 : len),
448 s, PL_colors[1],
449 (len > 60 ? "..." : "")
450 );
451 });
cad2e5aa 452
c344f387
JH
453 /* CHR_DIST() would be more correct here but it makes things slow. */
454 if (prog->minlen > strend - strpos) {
a3621e74 455 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 456 "String too short... [re_intuit_start]\n"));
cad2e5aa 457 goto fail;
2c2d71f5 458 }
a1933d95 459 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 460 PL_regeol = strend;
33b8afdf
JH
461 if (do_utf8) {
462 if (!prog->check_utf8 && prog->check_substr)
463 to_utf8_substr(prog);
464 check = prog->check_utf8;
465 } else {
466 if (!prog->check_substr && prog->check_utf8)
467 to_byte_substr(prog);
468 check = prog->check_substr;
469 }
470 if (check == &PL_sv_undef) {
a3621e74 471 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
472 "Non-utf string cannot match utf check string\n"));
473 goto fail;
474 }
2c2d71f5 475 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
476 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
477 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 478 && !multiline ) ); /* Check after \n? */
cad2e5aa 479
7e25d62c
JH
480 if (!ml_anch) {
481 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
482 | ROPT_IMPLICIT)) /* not a real BOL */
483 /* SvCUR is not set on references: SvRV and SvPVX overlap */
484 && sv && !SvROK(sv)
485 && (strpos != strbeg)) {
a3621e74 486 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
487 goto fail;
488 }
489 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 490 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 491 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
492 I32 slen;
493
1aa99e6b 494 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
495 if (SvTAIL(check)) {
496 slen = SvCUR(check); /* >= 1 */
cad2e5aa 497
9041c2e3 498 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 499 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 500 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 501 goto fail_finish;
cad2e5aa
JH
502 }
503 /* Now should match s[0..slen-2] */
504 slen--;
653099ff 505 if (slen && (*SvPVX(check) != *s
cad2e5aa 506 || (slen > 1
653099ff 507 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5 508 report_neq:
a3621e74 509 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
510 goto fail_finish;
511 }
cad2e5aa 512 }
653099ff
GS
513 else if (*SvPVX(check) != *s
514 || ((slen = SvCUR(check)) > 1
515 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
516 goto report_neq;
517 goto success_at_start;
7e25d62c 518 }
cad2e5aa 519 }
2c2d71f5 520 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 521 s = strpos;
2c2d71f5 522 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 523 end_shift = prog->minlen - start_shift -
653099ff 524 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 525 if (!ml_anch) {
653099ff
GS
526 I32 end = prog->check_offset_max + CHR_SVLEN(check)
527 - (SvTAIL(check) != 0);
1aa99e6b 528 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
529
530 if (end_shift < eshift)
531 end_shift = eshift;
532 }
cad2e5aa 533 }
2c2d71f5 534 else { /* Can match at random position */
cad2e5aa
JH
535 ml_anch = 0;
536 s = strpos;
2c2d71f5
JH
537 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
538 /* Should be nonnegative! */
539 end_shift = prog->minlen - start_shift -
653099ff 540 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
541 }
542
2c2d71f5 543#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 544 if (end_shift < 0)
6bbae5e6 545 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
546#endif
547
2c2d71f5
JH
548 restart:
549 /* Find a possible match in the region s..strend by looking for
550 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 551 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
552 I32 p = -1; /* Internal iterator of scream. */
553 I32 *pp = data ? data->scream_pos : &p;
554
2c2d71f5
JH
555 if (PL_screamfirst[BmRARE(check)] >= 0
556 || ( BmRARE(check) == '\n'
557 && (BmPREVIOUS(check) == SvCUR(check) - 1)
558 && SvTAIL(check) ))
9041c2e3 559 s = screaminstr(sv, check,
2c2d71f5 560 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 561 else
2c2d71f5 562 goto fail_finish;
4addbd3b
HS
563 /* we may be pointing at the wrong string */
564 if (s && RX_MATCH_COPIED(prog))
7ef91622 565 s = strbeg + (s - SvPVX(sv));
cad2e5aa
JH
566 if (data)
567 *data->scream_olds = s;
568 }
f33976b4 569 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
570 s = fbm_instr((U8*)(s + start_shift),
571 (U8*)(strend - end_shift),
7fba1cd6 572 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 573 else
1aa99e6b
IH
574 s = fbm_instr(HOP3(s, start_shift, strend),
575 HOP3(strend, -end_shift, strbeg),
7fba1cd6 576 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
577
578 /* Update the count-of-usability, remove useless subpatterns,
579 unshift s. */
2c2d71f5 580
a3621e74 581 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
2c2d71f5 582 (s ? "Found" : "Did not find"),
33b8afdf 583 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 584 PL_colors[0],
7b0972df
JH
585 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
586 SvPVX(check),
2c2d71f5
JH
587 PL_colors[1], (SvTAIL(check) ? "$" : ""),
588 (s ? " at offset " : "...\n") ) );
589
590 if (!s)
591 goto fail_finish;
592
6eb5f6b9
JH
593 check_at = s;
594
2c2d71f5 595 /* Finish the diagnostic message */
a3621e74 596 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
597
598 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
599 Start with the other substr.
600 XXXX no SCREAM optimization yet - and a very coarse implementation
601 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
602 *always* match. Probably should be marked during compile...
603 Probably it is right to do no SCREAM here...
604 */
605
33b8afdf 606 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 607 /* Take into account the "other" substring. */
2c2d71f5
JH
608 /* XXXX May be hopelessly wrong for UTF... */
609 if (!other_last)
6eb5f6b9 610 other_last = strpos;
33b8afdf 611 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
612 do_other_anchored:
613 {
1aa99e6b 614 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 615 char *s1 = s;
33b8afdf 616 SV* must;
2c2d71f5 617
2c2d71f5
JH
618 t = s - prog->check_offset_max;
619 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 620 && (!do_utf8
1aa99e6b 621 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 622 && t > strpos)))
30944b6d 623 /* EMPTY */;
2c2d71f5
JH
624 else
625 t = strpos;
1aa99e6b 626 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
627 if (t < other_last) /* These positions already checked */
628 t = other_last;
1aa99e6b 629 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
630 if (last < last1)
631 last1 = last;
632 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
633 /* On end-of-str: see comment below. */
33b8afdf
JH
634 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
635 if (must == &PL_sv_undef) {
636 s = (char*)NULL;
a3621e74 637 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
638 }
639 else
640 s = fbm_instr(
641 (unsigned char*)t,
642 HOP3(HOP3(last1, prog->anchored_offset, strend)
643 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
644 must,
7fba1cd6 645 multiline ? FBMrf_MULTILINE : 0
33b8afdf 646 );
a3621e74 647 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1aa99e6b 648 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
649 (s ? "Found" : "Contradicts"),
650 PL_colors[0],
33b8afdf
JH
651 (int)(SvCUR(must)
652 - (SvTAIL(must)!=0)),
653 SvPVX(must),
654 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
655 if (!s) {
656 if (last1 >= last2) {
a3621e74 657 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
658 ", giving up...\n"));
659 goto fail_finish;
660 }
a3621e74 661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 662 ", trying floating at offset %ld...\n",
1aa99e6b
IH
663 (long)(HOP3c(s1, 1, strend) - i_strpos)));
664 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
665 s = HOP3c(last, 1, strend);
2c2d71f5
JH
666 goto restart;
667 }
668 else {
a3621e74 669 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 670 (long)(s - i_strpos)));
1aa99e6b
IH
671 t = HOP3c(s, -prog->anchored_offset, strbeg);
672 other_last = HOP3c(s, 1, strend);
30944b6d 673 s = s1;
2c2d71f5
JH
674 if (t == strpos)
675 goto try_at_start;
2c2d71f5
JH
676 goto try_at_offset;
677 }
30944b6d 678 }
2c2d71f5
JH
679 }
680 else { /* Take into account the floating substring. */
33b8afdf
JH
681 char *last, *last1;
682 char *s1 = s;
683 SV* must;
684
685 t = HOP3c(s, -start_shift, strbeg);
686 last1 = last =
687 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
688 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
689 last = HOP3c(t, prog->float_max_offset, strend);
690 s = HOP3c(t, prog->float_min_offset, strend);
691 if (s < other_last)
692 s = other_last;
2c2d71f5 693 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
694 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
695 /* fbm_instr() takes into account exact value of end-of-str
696 if the check is SvTAIL(ed). Since false positives are OK,
697 and end-of-str is not later than strend we are OK. */
698 if (must == &PL_sv_undef) {
699 s = (char*)NULL;
a3621e74 700 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
701 }
702 else
2c2d71f5 703 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
704 (unsigned char*)last + SvCUR(must)
705 - (SvTAIL(must)!=0),
7fba1cd6 706 must, multiline ? FBMrf_MULTILINE : 0);
a3621e74 707 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
33b8afdf
JH
708 (s ? "Found" : "Contradicts"),
709 PL_colors[0],
710 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
711 SvPVX(must),
712 PL_colors[1], (SvTAIL(must) ? "$" : "")));
713 if (!s) {
714 if (last1 == last) {
a3621e74 715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
716 ", giving up...\n"));
717 goto fail_finish;
2c2d71f5 718 }
a3621e74 719 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
720 ", trying anchored starting at offset %ld...\n",
721 (long)(s1 + 1 - i_strpos)));
722 other_last = last;
723 s = HOP3c(t, 1, strend);
724 goto restart;
725 }
726 else {
a3621e74 727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
728 (long)(s - i_strpos)));
729 other_last = s; /* Fix this later. --Hugo */
730 s = s1;
731 if (t == strpos)
732 goto try_at_start;
733 goto try_at_offset;
734 }
2c2d71f5 735 }
cad2e5aa 736 }
2c2d71f5
JH
737
738 t = s - prog->check_offset_max;
2c2d71f5 739 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 740 && (!do_utf8
1aa99e6b
IH
741 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
742 && t > strpos))) {
2c2d71f5
JH
743 /* Fixed substring is found far enough so that the match
744 cannot start at strpos. */
745 try_at_offset:
cad2e5aa 746 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
747 /* Eventually fbm_*() should handle this, but often
748 anchored_offset is not 0, so this check will not be wasted. */
749 /* XXXX In the code below we prefer to look for "^" even in
750 presence of anchored substrings. And we search even
751 beyond the found float position. These pessimizations
752 are historical artefacts only. */
753 find_anchor:
2c2d71f5 754 while (t < strend - prog->minlen) {
cad2e5aa 755 if (*t == '\n') {
4ee3650e 756 if (t < check_at - prog->check_offset_min) {
33b8afdf 757 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
758 /* Since we moved from the found position,
759 we definitely contradict the found anchored
30944b6d
IZ
760 substr. Due to the above check we do not
761 contradict "check" substr.
762 Thus we can arrive here only if check substr
763 is float. Redo checking for "other"=="fixed".
764 */
9041c2e3 765 strpos = t + 1;
a3621e74 766 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
30944b6d
IZ
767 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
768 goto do_other_anchored;
769 }
4ee3650e
GS
770 /* We don't contradict the found floating substring. */
771 /* XXXX Why not check for STCLASS? */
cad2e5aa 772 s = t + 1;
a3621e74 773 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 774 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
775 goto set_useful;
776 }
4ee3650e
GS
777 /* Position contradicts check-string */
778 /* XXXX probably better to look for check-string
779 than for "\n", so one should lower the limit for t? */
a3621e74 780 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 781 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 782 other_last = strpos = s = t + 1;
cad2e5aa
JH
783 goto restart;
784 }
785 t++;
786 }
a3621e74 787 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
2c2d71f5
JH
788 PL_colors[0],PL_colors[1]));
789 goto fail_finish;
cad2e5aa 790 }
f5952150 791 else {
a3621e74 792 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
f5952150
GS
793 PL_colors[0],PL_colors[1]));
794 }
cad2e5aa
JH
795 s = t;
796 set_useful:
33b8afdf 797 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
798 }
799 else {
f5952150 800 /* The found string does not prohibit matching at strpos,
2c2d71f5 801 - no optimization of calling REx engine can be performed,
f5952150
GS
802 unless it was an MBOL and we are not after MBOL,
803 or a future STCLASS check will fail this. */
2c2d71f5
JH
804 try_at_start:
805 /* Even in this situation we may use MBOL flag if strpos is offset
806 wrt the start of the string. */
05b4157f 807 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 808 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
809 /* May be due to an implicit anchor of m{.*foo} */
810 && !(prog->reganch & ROPT_IMPLICIT))
811 {
cad2e5aa
JH
812 t = strpos;
813 goto find_anchor;
814 }
a3621e74 815 DEBUG_EXECUTE_r( if (ml_anch)
f5952150
GS
816 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
817 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 818 );
2c2d71f5 819 success_at_start:
30944b6d 820 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
821 && (do_utf8 ? (
822 prog->check_utf8 /* Could be deleted already */
823 && --BmUSEFUL(prog->check_utf8) < 0
824 && (prog->check_utf8 == prog->float_utf8)
825 ) : (
826 prog->check_substr /* Could be deleted already */
827 && --BmUSEFUL(prog->check_substr) < 0
828 && (prog->check_substr == prog->float_substr)
829 )))
66e933ab 830 {
cad2e5aa 831 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 832 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
833 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
834 if (do_utf8 ? prog->check_substr : prog->check_utf8)
835 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
836 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
837 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 838 check = Nullsv; /* abort */
cad2e5aa 839 s = strpos;
3cf5c195
IZ
840 /* XXXX This is a remnant of the old implementation. It
841 looks wasteful, since now INTUIT can use many
6eb5f6b9 842 other heuristics. */
cad2e5aa
JH
843 prog->reganch &= ~RE_USE_INTUIT;
844 }
845 else
846 s = strpos;
847 }
848
6eb5f6b9
JH
849 /* Last resort... */
850 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
851 if (prog->regstclass) {
852 /* minlen == 0 is possible if regstclass is \b or \B,
853 and the fixed substr is ''$.
854 Since minlen is already taken into account, s+1 is before strend;
855 accidentally, minlen >= 1 guaranties no false positives at s + 1
856 even for \b or \B. But (minlen? 1 : 0) below assumes that
857 regstclass does not come from lookahead... */
858 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
859 This leaves EXACTF only, which is dealt with in find_byclass(). */
06b5626a
AL
860 const U8* str = (U8*)STRING(prog->regstclass);
861 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 862 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 863 : 1);
33b8afdf 864 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 865 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 866 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
867 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
868 cl_l, strend)
869 : strend);
6eb5f6b9
JH
870
871 t = s;
9926ca43 872 cache_re(prog);
06b5626a 873 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
6eb5f6b9
JH
874 if (!s) {
875#ifdef DEBUGGING
e1ec3a88 876 const char *what = 0;
6eb5f6b9
JH
877#endif
878 if (endpos == strend) {
a3621e74 879 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
880 "Could not match STCLASS...\n") );
881 goto fail;
882 }
a3621e74 883 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 884 "This position contradicts STCLASS...\n") );
653099ff
GS
885 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
886 goto fail;
6eb5f6b9 887 /* Contradict one of substrings */
33b8afdf
JH
888 if (prog->anchored_substr || prog->anchored_utf8) {
889 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 890 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 891 hop_and_restart:
1aa99e6b 892 s = HOP3c(t, 1, strend);
66e933ab
GS
893 if (s + start_shift + end_shift > strend) {
894 /* XXXX Should be taken into account earlier? */
a3621e74 895 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
896 "Could not match STCLASS...\n") );
897 goto fail;
898 }
5e39e1e5
HS
899 if (!check)
900 goto giveup;
a3621e74 901 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 902 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
903 what, (long)(s + start_shift - i_strpos)) );
904 goto restart;
905 }
66e933ab 906 /* Have both, check_string is floating */
6eb5f6b9
JH
907 if (t + start_shift >= check_at) /* Contradicts floating=check */
908 goto retry_floating_check;
909 /* Recheck anchored substring, but not floating... */
9041c2e3 910 s = check_at;
5e39e1e5
HS
911 if (!check)
912 goto giveup;
a3621e74 913 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 914 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
915 (long)(other_last - i_strpos)) );
916 goto do_other_anchored;
917 }
60e71179
GS
918 /* Another way we could have checked stclass at the
919 current position only: */
920 if (ml_anch) {
921 s = t = t + 1;
5e39e1e5
HS
922 if (!check)
923 goto giveup;
a3621e74 924 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
925 "Looking for /%s^%s/m starting at offset %ld...\n",
926 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 927 goto try_at_offset;
66e933ab 928 }
33b8afdf 929 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 930 goto fail;
6eb5f6b9
JH
931 /* Check is floating subtring. */
932 retry_floating_check:
933 t = check_at - start_shift;
a3621e74 934 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
935 goto hop_and_restart;
936 }
b7953727 937 if (t != s) {
a3621e74 938 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 939 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
940 (long)(t - i_strpos), (long)(s - i_strpos))
941 );
942 }
943 else {
a3621e74 944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
945 "Does not contradict STCLASS...\n");
946 );
947 }
6eb5f6b9 948 }
5e39e1e5 949 giveup:
a3621e74 950 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
951 PL_colors[4], (check ? "Guessed" : "Giving up"),
952 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 953 return s;
2c2d71f5
JH
954
955 fail_finish: /* Substring not found */
33b8afdf
JH
956 if (prog->check_substr || prog->check_utf8) /* could be removed already */
957 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 958 fail:
a3621e74 959 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
960 PL_colors[4],PL_colors[5]));
961 return Nullch;
962}
9661b544 963
6eb5f6b9 964/* We know what class REx starts with. Try to find this position... */
3c3eec57 965STATIC char *
06b5626a 966S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, I32 norun)
a687059c 967{
6eb5f6b9
JH
968 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
969 char *m;
d8093b23 970 STRLEN ln;
5dab1207 971 STRLEN lnc;
078c425b 972 register STRLEN uskip;
d8093b23
G
973 unsigned int c1;
974 unsigned int c2;
6eb5f6b9
JH
975 char *e;
976 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 977 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 978
6eb5f6b9
JH
979 /* We know what class it must start with. */
980 switch (OP(c)) {
6eb5f6b9 981 case ANYOF:
388cc4de 982 if (do_utf8) {
078c425b 983 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
984 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
985 !UTF8_IS_INVARIANT((U8)s[0]) ?
986 reginclass(c, (U8*)s, 0, do_utf8) :
987 REGINCLASS(c, (U8*)s)) {
988 if (tmp && (norun || regtry(prog, s)))
989 goto got_it;
990 else
991 tmp = doevery;
992 }
993 else
994 tmp = 1;
078c425b 995 s += uskip;
388cc4de
HS
996 }
997 }
998 else {
999 while (s < strend) {
1000 STRLEN skip = 1;
1001
1002 if (REGINCLASS(c, (U8*)s) ||
1003 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1004 /* The assignment of 2 is intentional:
1005 * for the folded sharp s, the skip is 2. */
1006 (skip = SHARP_S_SKIP))) {
1007 if (tmp && (norun || regtry(prog, s)))
1008 goto got_it;
1009 else
1010 tmp = doevery;
1011 }
1012 else
1013 tmp = 1;
1014 s += skip;
1015 }
a0d0e21e 1016 }
6eb5f6b9 1017 break;
f33976b4
DB
1018 case CANY:
1019 while (s < strend) {
1020 if (tmp && (norun || regtry(prog, s)))
1021 goto got_it;
1022 else
1023 tmp = doevery;
1024 s++;
1025 }
1026 break;
6eb5f6b9 1027 case EXACTF:
5dab1207
NIS
1028 m = STRING(c);
1029 ln = STR_LEN(c); /* length to match in octets/bytes */
1030 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1031 if (UTF) {
a2a2844f 1032 STRLEN ulen1, ulen2;
5dab1207 1033 U8 *sm = (U8 *) m;
89ebb4a3
JH
1034 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1035 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
1036
1037 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1038 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1039
89ebb4a3 1040 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
872c91ae 1041 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
89ebb4a3 1042 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
872c91ae 1043 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
5dab1207
NIS
1044 lnc = 0;
1045 while (sm < ((U8 *) m + ln)) {
1046 lnc++;
1047 sm += UTF8SKIP(sm);
1048 }
1aa99e6b
IH
1049 }
1050 else {
1051 c1 = *(U8*)m;
1052 c2 = PL_fold[c1];
1053 }
6eb5f6b9
JH
1054 goto do_exactf;
1055 case EXACTFL:
5dab1207
NIS
1056 m = STRING(c);
1057 ln = STR_LEN(c);
1058 lnc = (I32) ln;
d8093b23 1059 c1 = *(U8*)m;
6eb5f6b9
JH
1060 c2 = PL_fold_locale[c1];
1061 do_exactf:
db12adc6 1062 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1063
6eb5f6b9
JH
1064 if (norun && e < s)
1065 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1066
60a8b682
JH
1067 /* The idea in the EXACTF* cases is to first find the
1068 * first character of the EXACTF* node and then, if
1069 * necessary, case-insensitively compare the full
1070 * text of the node. The c1 and c2 are the first
1071 * characters (though in Unicode it gets a bit
1072 * more complicated because there are more cases
7f16dd3d
JH
1073 * than just upper and lower: one needs to use
1074 * the so-called folding case for case-insensitive
1075 * matching (called "loose matching" in Unicode).
1076 * ibcmp_utf8() will do just that. */
60a8b682 1077
1aa99e6b 1078 if (do_utf8) {
575cac57 1079 UV c, f;
89ebb4a3
JH
1080 U8 tmpbuf [UTF8_MAXBYTES+1];
1081 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
575cac57 1082 STRLEN len, foldlen;
d7f013c8 1083
09091399 1084 if (c1 == c2) {
5dab1207
NIS
1085 /* Upper and lower of 1st char are equal -
1086 * probably not a "letter". */
1aa99e6b 1087 while (s <= e) {
89ebb4a3 1088 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1089 ckWARN(WARN_UTF8) ?
1090 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1091 if ( c == c1
1092 && (ln == len ||
66423254 1093 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1094 m, (char **)0, ln, (bool)UTF))
55da9344 1095 && (norun || regtry(prog, s)) )
1aa99e6b 1096 goto got_it;
80aecb99
JH
1097 else {
1098 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1099 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1100 if ( f != c
1101 && (f == c1 || f == c2)
1102 && (ln == foldlen ||
66423254
JH
1103 !ibcmp_utf8((char *) foldbuf,
1104 (char **)0, foldlen, do_utf8,
d07ddd77 1105 m,
eb160463 1106 (char **)0, ln, (bool)UTF))
80aecb99
JH
1107 && (norun || regtry(prog, s)) )
1108 goto got_it;
1109 }
1aa99e6b
IH
1110 s += len;
1111 }
09091399
JH
1112 }
1113 else {
1aa99e6b 1114 while (s <= e) {
89ebb4a3 1115 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1116 ckWARN(WARN_UTF8) ?
1117 0 : UTF8_ALLOW_ANY);
80aecb99 1118
60a8b682 1119 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1120 * Note that not all the possible combinations
1121 * are handled here: some of them are handled
1122 * by the standard folding rules, and some of
1123 * them (the character class or ANYOF cases)
1124 * are handled during compiletime in
1125 * regexec.c:S_regclass(). */
880bd946
JH
1126 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1127 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1128 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1129
1130 if ( (c == c1 || c == c2)
1131 && (ln == len ||
66423254 1132 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1133 m, (char **)0, ln, (bool)UTF))
55da9344 1134 && (norun || regtry(prog, s)) )
1aa99e6b 1135 goto got_it;
80aecb99
JH
1136 else {
1137 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1138 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1139 if ( f != c
1140 && (f == c1 || f == c2)
1141 && (ln == foldlen ||
a6872d42 1142 !ibcmp_utf8((char *) foldbuf,
66423254 1143 (char **)0, foldlen, do_utf8,
d07ddd77 1144 m,
eb160463 1145 (char **)0, ln, (bool)UTF))
80aecb99
JH
1146 && (norun || regtry(prog, s)) )
1147 goto got_it;
1148 }
1aa99e6b
IH
1149 s += len;
1150 }
09091399 1151 }
1aa99e6b
IH
1152 }
1153 else {
1154 if (c1 == c2)
1155 while (s <= e) {
1156 if ( *(U8*)s == c1
1157 && (ln == 1 || !(OP(c) == EXACTF
1158 ? ibcmp(s, m, ln)
1159 : ibcmp_locale(s, m, ln)))
1160 && (norun || regtry(prog, s)) )
1161 goto got_it;
1162 s++;
1163 }
1164 else
1165 while (s <= e) {
1166 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1167 && (ln == 1 || !(OP(c) == EXACTF
1168 ? ibcmp(s, m, ln)
1169 : ibcmp_locale(s, m, ln)))
1170 && (norun || regtry(prog, s)) )
1171 goto got_it;
1172 s++;
1173 }
b3c9acc1
IZ
1174 }
1175 break;
bbce6d69 1176 case BOUNDL:
3280af22 1177 PL_reg_flags |= RF_tainted;
bbce6d69 1178 /* FALL THROUGH */
a0d0e21e 1179 case BOUND:
ffc61ed2 1180 if (do_utf8) {
12d33761 1181 if (s == PL_bostr)
ffc61ed2
JH
1182 tmp = '\n';
1183 else {
b4f7163a 1184 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1185
b4f7163a 1186 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1187 }
1188 tmp = ((OP(c) == BOUND ?
9041c2e3 1189 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1190 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1191 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1192 if (tmp == !(OP(c) == BOUND ?
3568d838 1193 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1194 isALNUM_LC_utf8((U8*)s)))
1195 {
1196 tmp = !tmp;
1197 if ((norun || regtry(prog, s)))
1198 goto got_it;
1199 }
078c425b 1200 s += uskip;
a687059c 1201 }
a0d0e21e 1202 }
667bb95a 1203 else {
12d33761 1204 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1205 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1206 while (s < strend) {
1207 if (tmp ==
1208 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1209 tmp = !tmp;
1210 if ((norun || regtry(prog, s)))
1211 goto got_it;
1212 }
1213 s++;
a0ed51b3 1214 }
a0ed51b3 1215 }
6eb5f6b9 1216 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1217 goto got_it;
1218 break;
bbce6d69 1219 case NBOUNDL:
3280af22 1220 PL_reg_flags |= RF_tainted;
bbce6d69 1221 /* FALL THROUGH */
a0d0e21e 1222 case NBOUND:
ffc61ed2 1223 if (do_utf8) {
12d33761 1224 if (s == PL_bostr)
ffc61ed2
JH
1225 tmp = '\n';
1226 else {
b4f7163a 1227 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1228
b4f7163a 1229 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1230 }
1231 tmp = ((OP(c) == NBOUND ?
9041c2e3 1232 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1233 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1234 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1235 if (tmp == !(OP(c) == NBOUND ?
3568d838 1236 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1237 isALNUM_LC_utf8((U8*)s)))
1238 tmp = !tmp;
1239 else if ((norun || regtry(prog, s)))
1240 goto got_it;
078c425b 1241 s += uskip;
ffc61ed2 1242 }
a0d0e21e 1243 }
667bb95a 1244 else {
12d33761 1245 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1246 tmp = ((OP(c) == NBOUND ?
1247 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1248 while (s < strend) {
1249 if (tmp ==
1250 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1251 tmp = !tmp;
1252 else if ((norun || regtry(prog, s)))
1253 goto got_it;
1254 s++;
1255 }
a0ed51b3 1256 }
6eb5f6b9 1257 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1258 goto got_it;
1259 break;
a0d0e21e 1260 case ALNUM:
ffc61ed2 1261 if (do_utf8) {
8269fa76 1262 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1263 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1264 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1265 if (tmp && (norun || regtry(prog, s)))
1266 goto got_it;
1267 else
1268 tmp = doevery;
1269 }
bbce6d69 1270 else
ffc61ed2 1271 tmp = 1;
078c425b 1272 s += uskip;
bbce6d69 1273 }
bbce6d69 1274 }
ffc61ed2
JH
1275 else {
1276 while (s < strend) {
1277 if (isALNUM(*s)) {
1278 if (tmp && (norun || regtry(prog, s)))
1279 goto got_it;
1280 else
1281 tmp = doevery;
1282 }
a0ed51b3 1283 else
ffc61ed2
JH
1284 tmp = 1;
1285 s++;
a0ed51b3 1286 }
a0ed51b3
LW
1287 }
1288 break;
bbce6d69 1289 case ALNUML:
3280af22 1290 PL_reg_flags |= RF_tainted;
ffc61ed2 1291 if (do_utf8) {
078c425b 1292 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1293 if (isALNUM_LC_utf8((U8*)s)) {
1294 if (tmp && (norun || regtry(prog, s)))
1295 goto got_it;
1296 else
1297 tmp = doevery;
1298 }
a687059c 1299 else
ffc61ed2 1300 tmp = 1;
078c425b 1301 s += uskip;
a0d0e21e 1302 }
a0d0e21e 1303 }
ffc61ed2
JH
1304 else {
1305 while (s < strend) {
1306 if (isALNUM_LC(*s)) {
1307 if (tmp && (norun || regtry(prog, s)))
1308 goto got_it;
1309 else
1310 tmp = doevery;
1311 }
a0ed51b3 1312 else
ffc61ed2
JH
1313 tmp = 1;
1314 s++;
a0ed51b3 1315 }
a0ed51b3
LW
1316 }
1317 break;
a0d0e21e 1318 case NALNUM:
ffc61ed2 1319 if (do_utf8) {
8269fa76 1320 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1321 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1322 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1323 if (tmp && (norun || regtry(prog, s)))
1324 goto got_it;
1325 else
1326 tmp = doevery;
1327 }
bbce6d69 1328 else
ffc61ed2 1329 tmp = 1;
078c425b 1330 s += uskip;
bbce6d69 1331 }
bbce6d69 1332 }
ffc61ed2
JH
1333 else {
1334 while (s < strend) {
1335 if (!isALNUM(*s)) {
1336 if (tmp && (norun || regtry(prog, s)))
1337 goto got_it;
1338 else
1339 tmp = doevery;
1340 }
a0ed51b3 1341 else
ffc61ed2
JH
1342 tmp = 1;
1343 s++;
a0ed51b3 1344 }
a0ed51b3
LW
1345 }
1346 break;
bbce6d69 1347 case NALNUML:
3280af22 1348 PL_reg_flags |= RF_tainted;
ffc61ed2 1349 if (do_utf8) {
078c425b 1350 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1351 if (!isALNUM_LC_utf8((U8*)s)) {
1352 if (tmp && (norun || regtry(prog, s)))
1353 goto got_it;
1354 else
1355 tmp = doevery;
1356 }
a687059c 1357 else
ffc61ed2 1358 tmp = 1;
078c425b 1359 s += uskip;
a687059c 1360 }
a0d0e21e 1361 }
ffc61ed2
JH
1362 else {
1363 while (s < strend) {
1364 if (!isALNUM_LC(*s)) {
1365 if (tmp && (norun || regtry(prog, s)))
1366 goto got_it;
1367 else
1368 tmp = doevery;
1369 }
a0ed51b3 1370 else
ffc61ed2
JH
1371 tmp = 1;
1372 s++;
a0ed51b3 1373 }
a0ed51b3
LW
1374 }
1375 break;
a0d0e21e 1376 case SPACE:
ffc61ed2 1377 if (do_utf8) {
8269fa76 1378 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1379 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1380 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1381 if (tmp && (norun || regtry(prog, s)))
1382 goto got_it;
1383 else
1384 tmp = doevery;
1385 }
a0d0e21e 1386 else
ffc61ed2 1387 tmp = 1;
078c425b 1388 s += uskip;
2304df62 1389 }
a0d0e21e 1390 }
ffc61ed2
JH
1391 else {
1392 while (s < strend) {
1393 if (isSPACE(*s)) {
1394 if (tmp && (norun || regtry(prog, s)))
1395 goto got_it;
1396 else
1397 tmp = doevery;
1398 }
a0ed51b3 1399 else
ffc61ed2
JH
1400 tmp = 1;
1401 s++;
a0ed51b3 1402 }
a0ed51b3
LW
1403 }
1404 break;
bbce6d69 1405 case SPACEL:
3280af22 1406 PL_reg_flags |= RF_tainted;
ffc61ed2 1407 if (do_utf8) {
078c425b 1408 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1409 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1410 if (tmp && (norun || regtry(prog, s)))
1411 goto got_it;
1412 else
1413 tmp = doevery;
1414 }
bbce6d69 1415 else
ffc61ed2 1416 tmp = 1;
078c425b 1417 s += uskip;
bbce6d69 1418 }
bbce6d69 1419 }
ffc61ed2
JH
1420 else {
1421 while (s < strend) {
1422 if (isSPACE_LC(*s)) {
1423 if (tmp && (norun || regtry(prog, s)))
1424 goto got_it;
1425 else
1426 tmp = doevery;
1427 }
a0ed51b3 1428 else
ffc61ed2
JH
1429 tmp = 1;
1430 s++;
a0ed51b3 1431 }
a0ed51b3
LW
1432 }
1433 break;
a0d0e21e 1434 case NSPACE:
ffc61ed2 1435 if (do_utf8) {
8269fa76 1436 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1437 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1438 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1439 if (tmp && (norun || regtry(prog, s)))
1440 goto got_it;
1441 else
1442 tmp = doevery;
1443 }
a0d0e21e 1444 else
ffc61ed2 1445 tmp = 1;
078c425b 1446 s += uskip;
a687059c 1447 }
a0d0e21e 1448 }
ffc61ed2
JH
1449 else {
1450 while (s < strend) {
1451 if (!isSPACE(*s)) {
1452 if (tmp && (norun || regtry(prog, s)))
1453 goto got_it;
1454 else
1455 tmp = doevery;
1456 }
a0ed51b3 1457 else
ffc61ed2
JH
1458 tmp = 1;
1459 s++;
a0ed51b3 1460 }
a0ed51b3
LW
1461 }
1462 break;
bbce6d69 1463 case NSPACEL:
3280af22 1464 PL_reg_flags |= RF_tainted;
ffc61ed2 1465 if (do_utf8) {
078c425b 1466 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1467 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1468 if (tmp && (norun || regtry(prog, s)))
1469 goto got_it;
1470 else
1471 tmp = doevery;
1472 }
bbce6d69 1473 else
ffc61ed2 1474 tmp = 1;
078c425b 1475 s += uskip;
bbce6d69 1476 }
bbce6d69 1477 }
ffc61ed2
JH
1478 else {
1479 while (s < strend) {
1480 if (!isSPACE_LC(*s)) {
1481 if (tmp && (norun || regtry(prog, s)))
1482 goto got_it;
1483 else
1484 tmp = doevery;
1485 }
a0ed51b3 1486 else
ffc61ed2
JH
1487 tmp = 1;
1488 s++;
a0ed51b3 1489 }
a0ed51b3
LW
1490 }
1491 break;
a0d0e21e 1492 case DIGIT:
ffc61ed2 1493 if (do_utf8) {
8269fa76 1494 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1495 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1496 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1497 if (tmp && (norun || regtry(prog, s)))
1498 goto got_it;
1499 else
1500 tmp = doevery;
1501 }
a0d0e21e 1502 else
ffc61ed2 1503 tmp = 1;
078c425b 1504 s += uskip;
2b69d0c2 1505 }
a0d0e21e 1506 }
ffc61ed2
JH
1507 else {
1508 while (s < strend) {
1509 if (isDIGIT(*s)) {
1510 if (tmp && (norun || regtry(prog, s)))
1511 goto got_it;
1512 else
1513 tmp = doevery;
1514 }
a0ed51b3 1515 else
ffc61ed2
JH
1516 tmp = 1;
1517 s++;
a0ed51b3 1518 }
a0ed51b3
LW
1519 }
1520 break;
b8c5462f
JH
1521 case DIGITL:
1522 PL_reg_flags |= RF_tainted;
ffc61ed2 1523 if (do_utf8) {
078c425b 1524 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1525 if (isDIGIT_LC_utf8((U8*)s)) {
1526 if (tmp && (norun || regtry(prog, s)))
1527 goto got_it;
1528 else
1529 tmp = doevery;
1530 }
b8c5462f 1531 else
ffc61ed2 1532 tmp = 1;
078c425b 1533 s += uskip;
b8c5462f 1534 }
b8c5462f 1535 }
ffc61ed2
JH
1536 else {
1537 while (s < strend) {
1538 if (isDIGIT_LC(*s)) {
1539 if (tmp && (norun || regtry(prog, s)))
1540 goto got_it;
1541 else
1542 tmp = doevery;
1543 }
b8c5462f 1544 else
ffc61ed2
JH
1545 tmp = 1;
1546 s++;
b8c5462f 1547 }
b8c5462f
JH
1548 }
1549 break;
a0d0e21e 1550 case NDIGIT:
ffc61ed2 1551 if (do_utf8) {
8269fa76 1552 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1553 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1554 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1555 if (tmp && (norun || regtry(prog, s)))
1556 goto got_it;
1557 else
1558 tmp = doevery;
1559 }
a0d0e21e 1560 else
ffc61ed2 1561 tmp = 1;
078c425b 1562 s += uskip;
a687059c 1563 }
a0d0e21e 1564 }
ffc61ed2
JH
1565 else {
1566 while (s < strend) {
1567 if (!isDIGIT(*s)) {
1568 if (tmp && (norun || regtry(prog, s)))
1569 goto got_it;
1570 else
1571 tmp = doevery;
1572 }
a0ed51b3 1573 else
ffc61ed2
JH
1574 tmp = 1;
1575 s++;
a0ed51b3 1576 }
a0ed51b3
LW
1577 }
1578 break;
b8c5462f
JH
1579 case NDIGITL:
1580 PL_reg_flags |= RF_tainted;
ffc61ed2 1581 if (do_utf8) {
078c425b 1582 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1583 if (!isDIGIT_LC_utf8((U8*)s)) {
1584 if (tmp && (norun || regtry(prog, s)))
1585 goto got_it;
1586 else
1587 tmp = doevery;
1588 }
b8c5462f 1589 else
ffc61ed2 1590 tmp = 1;
078c425b 1591 s += uskip;
b8c5462f 1592 }
a0ed51b3 1593 }
ffc61ed2
JH
1594 else {
1595 while (s < strend) {
1596 if (!isDIGIT_LC(*s)) {
1597 if (tmp && (norun || regtry(prog, s)))
1598 goto got_it;
1599 else
1600 tmp = doevery;
1601 }
cf93c79d 1602 else
ffc61ed2
JH
1603 tmp = 1;
1604 s++;
b8c5462f 1605 }
b8c5462f
JH
1606 }
1607 break;
b3c9acc1 1608 default:
3c3eec57
GS
1609 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1610 break;
d6a28714 1611 }
6eb5f6b9
JH
1612 return 0;
1613 got_it:
1614 return s;
1615}
1616
1617/*
1618 - regexec_flags - match a regexp against a string
1619 */
1620I32
1621Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1622 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1623/* strend: pointer to null at end of string */
1624/* strbeg: real beginning of string */
1625/* minend: end of match must be >=minend after stringarg. */
1626/* data: May be used for some additional optimizations. */
1627/* nosave: For optimizations. */
1628{
6eb5f6b9
JH
1629 register char *s;
1630 register regnode *c;
1631 register char *startpos = stringarg;
6eb5f6b9
JH
1632 I32 minlen; /* must match at least this many chars */
1633 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1634 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1635 constant substr. */ /* CC */
1636 I32 end_shift = 0; /* Same for the end. */ /* CC */
1637 I32 scream_pos = -1; /* Internal iterator of scream. */
1638 char *scream_olds;
1639 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1640 bool do_utf8 = DO_UTF8(sv);
7fba1cd6 1641 I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1642#ifdef DEBUGGING
9e55ce06
JH
1643 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1644 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1645#endif
a3621e74
YO
1646
1647 GET_RE_DEBUG_FLAGS_DECL;
1648
a30b2f1f 1649 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1650
1651 PL_regcc = 0;
1652
1653 cache_re(prog);
1654#ifdef DEBUGGING
aea4f609 1655 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1656#endif
1657
1658 /* Be paranoid... */
1659 if (prog == NULL || startpos == NULL) {
1660 Perl_croak(aTHX_ "NULL regexp parameter");
1661 return 0;
1662 }
1663
1664 minlen = prog->minlen;
61a36c01 1665 if (strend - startpos < minlen) {
a3621e74 1666 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1667 "String too short [regexec_flags]...\n"));
1668 goto phooey;
1aa99e6b 1669 }
6eb5f6b9 1670
6eb5f6b9
JH
1671 /* Check validity of program. */
1672 if (UCHARAT(prog->program) != REG_MAGIC) {
1673 Perl_croak(aTHX_ "corrupted regexp program");
1674 }
1675
1676 PL_reg_flags = 0;
1677 PL_reg_eval_set = 0;
1678 PL_reg_maxiter = 0;
1679
1680 if (prog->reganch & ROPT_UTF8)
1681 PL_reg_flags |= RF_utf8;
1682
1683 /* Mark beginning of line for ^ and lookbehind. */
1684 PL_regbol = startpos;
1685 PL_bostr = strbeg;
1686 PL_reg_sv = sv;
1687
1688 /* Mark end of line for $ (and such) */
1689 PL_regeol = strend;
1690
1691 /* see how far we have to get to not match where we matched before */
1692 PL_regtill = startpos+minend;
1693
1694 /* We start without call_cc context. */
1695 PL_reg_call_cc = 0;
1696
1697 /* If there is a "must appear" string, look for it. */
1698 s = startpos;
1699
1700 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1701 MAGIC *mg;
1702
1703 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1704 PL_reg_ganch = startpos;
1705 else if (sv && SvTYPE(sv) >= SVt_PVMG
1706 && SvMAGIC(sv)
14befaf4
DM
1707 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1708 && mg->mg_len >= 0) {
6eb5f6b9
JH
1709 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1710 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1711 if (s > PL_reg_ganch)
6eb5f6b9
JH
1712 goto phooey;
1713 s = PL_reg_ganch;
1714 }
1715 }
1716 else /* pos() not defined */
1717 PL_reg_ganch = strbeg;
1718 }
1719
33b8afdf 1720 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1721 re_scream_pos_data d;
1722
1723 d.scream_olds = &scream_olds;
1724 d.scream_pos = &scream_pos;
1725 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1726 if (!s) {
a3621e74 1727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1728 goto phooey; /* not present */
3fa9c3d7 1729 }
6eb5f6b9
JH
1730 }
1731
a3621e74 1732 DEBUG_EXECUTE_r({
9e55ce06
JH
1733 char *s0 = UTF ?
1734 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1735 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1736 prog->precomp;
1737 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1738 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1739 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1740 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1741 if (!PL_colorset)
1742 reginitcolors();
1743 PerlIO_printf(Perl_debug_log,
9e55ce06 1744 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1745 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1746 len0, len0, s0,
2a782b5b 1747 PL_colors[1],
9e55ce06 1748 len0 > 60 ? "..." : "",
2a782b5b 1749 PL_colors[0],
9e55ce06
JH
1750 (int)(len1 > 60 ? 60 : len1),
1751 s1, PL_colors[1],
1752 (len1 > 60 ? "..." : "")
2a782b5b
JH
1753 );
1754 });
6eb5f6b9
JH
1755
1756 /* Simplest case: anchored match need be tried only once. */
1757 /* [unless only anchor is BOL and multiline is set] */
1758 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1759 if (s == startpos && regtry(prog, startpos))
1760 goto got_it;
7fba1cd6 1761 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1762 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1763 {
1764 char *end;
1765
1766 if (minlen)
1767 dontbother = minlen - 1;
1aa99e6b 1768 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1769 /* for multiline we only have to try after newlines */
33b8afdf 1770 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1771 if (s == startpos)
1772 goto after_try;
1773 while (1) {
1774 if (regtry(prog, s))
1775 goto got_it;
1776 after_try:
1777 if (s >= end)
1778 goto phooey;
1779 if (prog->reganch & RE_USE_INTUIT) {
1780 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1781 if (!s)
1782 goto phooey;
1783 }
1784 else
1785 s++;
1786 }
1787 } else {
1788 if (s > startpos)
1789 s--;
1790 while (s < end) {
1791 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1792 if (regtry(prog, s))
1793 goto got_it;
1794 }
1795 }
1796 }
1797 }
1798 goto phooey;
1799 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1800 if (regtry(prog, PL_reg_ganch))
1801 goto got_it;
1802 goto phooey;
1803 }
1804
1805 /* Messy cases: unanchored match. */
33b8afdf 1806 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1807 /* we have /x+whatever/ */
1808 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1809 char ch;
bf93d4cc
GS
1810#ifdef DEBUGGING
1811 int did_match = 0;
1812#endif
33b8afdf
JH
1813 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1814 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1815 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1816
1aa99e6b 1817 if (do_utf8) {
6eb5f6b9
JH
1818 while (s < strend) {
1819 if (*s == ch) {
a3621e74 1820 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1821 if (regtry(prog, s)) goto got_it;
1822 s += UTF8SKIP(s);
1823 while (s < strend && *s == ch)
1824 s += UTF8SKIP(s);
1825 }
1826 s += UTF8SKIP(s);
1827 }
1828 }
1829 else {
1830 while (s < strend) {
1831 if (*s == ch) {
a3621e74 1832 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1833 if (regtry(prog, s)) goto got_it;
1834 s++;
1835 while (s < strend && *s == ch)
1836 s++;
1837 }
1838 s++;
1839 }
1840 }
a3621e74 1841 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1842 PerlIO_printf(Perl_debug_log,
b7953727
JH
1843 "Did not find anchored character...\n")
1844 );
6eb5f6b9
JH
1845 }
1846 /*SUPPRESS 560*/
33b8afdf
JH
1847 else if (prog->anchored_substr != Nullsv
1848 || prog->anchored_utf8 != Nullsv
1849 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1850 && prog->float_max_offset < strend - s)) {
1851 SV *must;
1852 I32 back_max;
1853 I32 back_min;
1854 char *last;
6eb5f6b9 1855 char *last1; /* Last position checked before */
bf93d4cc
GS
1856#ifdef DEBUGGING
1857 int did_match = 0;
1858#endif
33b8afdf
JH
1859 if (prog->anchored_substr || prog->anchored_utf8) {
1860 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1861 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1862 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1863 back_max = back_min = prog->anchored_offset;
1864 } else {
1865 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1866 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1867 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1868 back_max = prog->float_max_offset;
1869 back_min = prog->float_min_offset;
1870 }
1871 if (must == &PL_sv_undef)
1872 /* could not downgrade utf8 check substring, so must fail */
1873 goto phooey;
1874
1875 last = HOP3c(strend, /* Cannot start after this */
1876 -(I32)(CHR_SVLEN(must)
1877 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1878
1879 if (s > PL_bostr)
1880 last1 = HOPc(s, -1);
1881 else
1882 last1 = s - 1; /* bogus */
1883
1884 /* XXXX check_substr already used to find `s', can optimize if
1885 check_substr==must. */
1886 scream_pos = -1;
1887 dontbother = end_shift;
1888 strend = HOPc(strend, -dontbother);
1889 while ( (s <= last) &&
9041c2e3 1890 ((flags & REXEC_SCREAM)
1aa99e6b 1891 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1892 end_shift, &scream_pos, 0))
1aa99e6b 1893 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1894 (unsigned char*)strend, must,
7fba1cd6 1895 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1896 /* we may be pointing at the wrong string */
1897 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
7ef91622 1898 s = strbeg + (s - SvPVX(sv));
a3621e74 1899 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1900 if (HOPc(s, -back_max) > last1) {
1901 last1 = HOPc(s, -back_min);
1902 s = HOPc(s, -back_max);
1903 }
1904 else {
1905 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1906
1907 last1 = HOPc(s, -back_min);
1908 s = t;
1909 }
1aa99e6b 1910 if (do_utf8) {
6eb5f6b9
JH
1911 while (s <= last1) {
1912 if (regtry(prog, s))
1913 goto got_it;
1914 s += UTF8SKIP(s);
1915 }
1916 }
1917 else {
1918 while (s <= last1) {
1919 if (regtry(prog, s))
1920 goto got_it;
1921 s++;
1922 }
1923 }
1924 }
a3621e74 1925 DEBUG_EXECUTE_r(if (!did_match)
b7953727
JH
1926 PerlIO_printf(Perl_debug_log,
1927 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1928 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1929 ? "anchored" : "floating"),
1930 PL_colors[0],
1931 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1932 SvPVX(must),
b7953727
JH
1933 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1934 );
6eb5f6b9
JH
1935 goto phooey;
1936 }
155aba94 1937 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1938 if (minlen) {
1939 I32 op = (U8)OP(prog->regstclass);
66e933ab 1940 /* don't bother with what can't match */
f14c76ed
RGS
1941 if (PL_regkind[op] != EXACT && op != CANY)
1942 strend = HOPc(strend, -(minlen - 1));
1943 }
a3621e74 1944 DEBUG_EXECUTE_r({
ffc61ed2 1945 SV *prop = sv_newmortal();
9e55ce06
JH
1946 char *s0;
1947 char *s1;
1948 int len0;
1949 int len1;
1950
ffc61ed2 1951 regprop(prop, c);
9e55ce06
JH
1952 s0 = UTF ?
1953 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1954 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1955 SvPVX(prop);
1956 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1957 s1 = UTF ?
c728cb41 1958 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1959 len1 = UTF ? SvCUR(dsv1) : strend - s;
1960 PerlIO_printf(Perl_debug_log,
1961 "Matching stclass `%*.*s' against `%*.*s'\n",
1962 len0, len0, s0,
1963 len1, len1, s1);
ffc61ed2 1964 });
06b5626a 1965 if (find_byclass(prog, c, s, strend, 0))
6eb5f6b9 1966 goto got_it;
a3621e74 1967 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1968 }
1969 else {
1970 dontbother = 0;
33b8afdf
JH
1971 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1972 /* Trim the end. */
d6a28714 1973 char *last;
33b8afdf
JH
1974 SV* float_real;
1975
1976 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1977 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1978 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1979
1980 if (flags & REXEC_SCREAM) {
33b8afdf 1981 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1982 end_shift, &scream_pos, 1); /* last one */
1983 if (!last)
ffc61ed2 1984 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1985 /* we may be pointing at the wrong string */
1986 else if (RX_MATCH_COPIED(prog))
7ef91622 1987 s = strbeg + (s - SvPVX(sv));
b8c5462f 1988 }
d6a28714
JH
1989 else {
1990 STRLEN len;
06b5626a 1991 const char * const little = SvPV(float_real, len);
d6a28714 1992
33b8afdf 1993 if (SvTAIL(float_real)) {
d6a28714
JH
1994 if (memEQ(strend - len + 1, little, len - 1))
1995 last = strend - len + 1;
7fba1cd6 1996 else if (!multiline)
9041c2e3 1997 last = memEQ(strend - len, little, len)
d6a28714 1998 ? strend - len : Nullch;
b8c5462f 1999 else
d6a28714
JH
2000 goto find_last;
2001 } else {
2002 find_last:
9041c2e3 2003 if (len)
d6a28714 2004 last = rninstr(s, strend, little, little + len);
b8c5462f 2005 else
d6a28714 2006 last = strend; /* matching `$' */
b8c5462f 2007 }
b8c5462f 2008 }
bf93d4cc 2009 if (last == NULL) {
a3621e74 2010 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc
GS
2011 "%sCan't trim the tail, match fails (should not happen)%s\n",
2012 PL_colors[4],PL_colors[5]));
2013 goto phooey; /* Should not happen! */
2014 }
d6a28714
JH
2015 dontbother = strend - last + prog->float_min_offset;
2016 }
2017 if (minlen && (dontbother < minlen))
2018 dontbother = minlen - 1;
2019 strend -= dontbother; /* this one's always in bytes! */
2020 /* We don't know much -- general case. */
1aa99e6b 2021 if (do_utf8) {
d6a28714
JH
2022 for (;;) {
2023 if (regtry(prog, s))
2024 goto got_it;
2025 if (s >= strend)
2026 break;
b8c5462f 2027 s += UTF8SKIP(s);
d6a28714
JH
2028 };
2029 }
2030 else {
2031 do {
2032 if (regtry(prog, s))
2033 goto got_it;
2034 } while (s++ < strend);
2035 }
2036 }
2037
2038 /* Failure. */
2039 goto phooey;
2040
2041got_it:
2042 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2043
2044 if (PL_reg_eval_set) {
2045 /* Preserve the current value of $^R */
2046 if (oreplsv != GvSV(PL_replgv))
2047 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2048 restored, the value remains
2049 the same. */
acfe0abc 2050 restore_pos(aTHX_ 0);
d6a28714
JH
2051 }
2052
2053 /* make sure $`, $&, $', and $digit will work later */
2054 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2055 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2056 if (flags & REXEC_COPY_STR) {
2057 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2058#ifdef PERL_COPY_ON_WRITE
2059 if ((SvIsCOW(sv)
2060 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2061 if (DEBUG_C_TEST) {
2062 PerlIO_printf(Perl_debug_log,
2063 "Copy on write: regexp capture, type %d\n",
2064 (int) SvTYPE(sv));
2065 }
2066 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2067 prog->subbeg = SvPVX(prog->saved_copy);
2068 assert (SvPOKp(prog->saved_copy));
2069 } else
2070#endif
2071 {
2072 RX_MATCH_COPIED_on(prog);
2073 s = savepvn(strbeg, i);
2074 prog->subbeg = s;
2075 }
d6a28714 2076 prog->sublen = i;
d6a28714
JH
2077 }
2078 else {
2079 prog->subbeg = strbeg;
2080 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2081 }
2082 }
9041c2e3 2083
d6a28714
JH
2084 return 1;
2085
2086phooey:
a3621e74 2087 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
bf93d4cc 2088 PL_colors[4],PL_colors[5]));
d6a28714 2089 if (PL_reg_eval_set)
acfe0abc 2090 restore_pos(aTHX_ 0);
d6a28714
JH
2091 return 0;
2092}
2093
2094/*
2095 - regtry - try match at specific point
2096 */
2097STATIC I32 /* 0 failure, 1 success */
2098S_regtry(pTHX_ regexp *prog, char *startpos)
2099{
d6a28714
JH
2100 register I32 i;
2101 register I32 *sp;
2102 register I32 *ep;
2103 CHECKPOINT lastcp;
a3621e74 2104 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2105
02db2b7b
IZ
2106#ifdef DEBUGGING
2107 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2108#endif
d6a28714
JH
2109 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2110 MAGIC *mg;
2111
2112 PL_reg_eval_set = RS_init;
a3621e74 2113 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2114 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2115 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2116 ));
e8347627 2117 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2118 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2119 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2120 SAVETMPS;
2121 /* Apparently this is not needed, judging by wantarray. */
e8347627 2122 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2123 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2124
2125 if (PL_reg_sv) {
2126 /* Make $_ available to executed code. */
2127 if (PL_reg_sv != DEFSV) {
59f00321 2128 SAVE_DEFSV;
d6a28714 2129 DEFSV = PL_reg_sv;
b8c5462f 2130 }
d6a28714 2131
9041c2e3 2132 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2133 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2134 /* prepare for quick setting of pos */
14befaf4
DM
2135 sv_magic(PL_reg_sv, (SV*)0,
2136 PERL_MAGIC_regex_global, Nullch, 0);
2137 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2138 mg->mg_len = -1;
b8c5462f 2139 }
d6a28714
JH
2140 PL_reg_magic = mg;
2141 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2142 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2143 }
09687e5a 2144 if (!PL_reg_curpm) {
0f79a09d 2145 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2146#ifdef USE_ITHREADS
2147 {
2148 SV* repointer = newSViv(0);
577e12cc 2149 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2150 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2151 av_push(PL_regex_padav,repointer);
2152 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2153 PL_regex_pad = AvARRAY(PL_regex_padav);
2154 }
2155#endif
2156 }
aaa362c4 2157 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2158 PL_reg_oldcurpm = PL_curpm;
2159 PL_curpm = PL_reg_curpm;
2160 if (RX_MATCH_COPIED(prog)) {
2161 /* Here is a serious problem: we cannot rewrite subbeg,
2162 since it may be needed if this match fails. Thus
2163 $` inside (?{}) could fail... */
2164 PL_reg_oldsaved = prog->subbeg;
2165 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2166#ifdef PERL_COPY_ON_WRITE
2167 PL_nrs = prog->saved_copy;
2168#endif
d6a28714
JH
2169 RX_MATCH_COPIED_off(prog);
2170 }
2171 else
2172 PL_reg_oldsaved = Nullch;
2173 prog->subbeg = PL_bostr;
2174 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2175 }
2176 prog->startp[0] = startpos - PL_bostr;
2177 PL_reginput = startpos;
2178 PL_regstartp = prog->startp;
2179 PL_regendp = prog->endp;
2180 PL_reglastparen = &prog->lastparen;
a01268b5 2181 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2182 prog->lastparen = 0;
03994de8 2183 prog->lastcloseparen = 0;
d6a28714 2184 PL_regsize = 0;
a3621e74 2185 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2186 if (PL_reg_start_tmpl <= prog->nparens) {
2187 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2188 if(PL_reg_start_tmp)
2189 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2190 else
2191 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2192 }
2193
2194 /* XXXX What this code is doing here?!!! There should be no need
2195 to do this again and again, PL_reglastparen should take care of
3dd2943c 2196 this! --ilya*/
dafc8851
JH
2197
2198 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2199 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2200 * PL_reglastparen), is not needed at all by the test suite
2201 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2202 * enough, for building DynaLoader, or otherwise this
2203 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2204 * will happen. Meanwhile, this code *is* needed for the
2205 * above-mentioned test suite tests to succeed. The common theme
2206 * on those tests seems to be returning null fields from matches.
2207 * --jhi */
dafc8851 2208#if 1
d6a28714
JH
2209 sp = prog->startp;
2210 ep = prog->endp;
2211 if (prog->nparens) {
eb160463 2212 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2213 *++sp = -1;
2214 *++ep = -1;
2215 }
2216 }
dafc8851 2217#endif
02db2b7b 2218 REGCP_SET(lastcp);
d6a28714
JH
2219 if (regmatch(prog->program + 1)) {
2220 prog->endp[0] = PL_reginput - PL_bostr;
2221 return 1;
2222 }
02db2b7b 2223 REGCP_UNWIND(lastcp);
d6a28714
JH
2224 return 0;
2225}
2226
02db2b7b
IZ
2227#define RE_UNWIND_BRANCH 1
2228#define RE_UNWIND_BRANCHJ 2
2229
2230union re_unwind_t;
2231
2232typedef struct { /* XX: makes sense to enlarge it... */
2233 I32 type;
2234 I32 prev;
2235 CHECKPOINT lastcp;
2236} re_unwind_generic_t;
2237
2238typedef struct {
2239 I32 type;
2240 I32 prev;
2241 CHECKPOINT lastcp;
2242 I32 lastparen;
2243 regnode *next;
2244 char *locinput;
2245 I32 nextchr;
2246#ifdef DEBUGGING
2247 int regindent;
2248#endif
2249} re_unwind_branch_t;
2250
2251typedef union re_unwind_t {
2252 I32 type;
2253 re_unwind_generic_t generic;
2254 re_unwind_branch_t branch;
2255} re_unwind_t;
2256
8ba1375e
MJD
2257#define sayYES goto yes
2258#define sayNO goto no
e0f9d4a8 2259#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2260#define sayYES_FINAL goto yes_final
2261#define sayYES_LOUD goto yes_loud
2262#define sayNO_FINAL goto no_final
2263#define sayNO_SILENT goto do_no
2264#define saySAME(x) if (x) goto yes; else goto no
2265
3ab3c9b4
HS
2266#define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2267#define POSCACHE_SEEN 1 /* we know what we're caching */
2268#define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2269#define CACHEsayYES STMT_START { \
2270 if (cache_offset | cache_bit) { \
2271 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2272 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2273 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2274 /* cache records failure, but this is success */ \
2275 DEBUG_r( \
2276 PerlIO_printf(Perl_debug_log, \
2277 "%*s (remove success from failure cache)\n", \
2278 REPORT_CODE_OFF+PL_regindent*2, "") \
2279 ); \
2280 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2281 } \
2282 } \
2283 sayYES; \
2284} STMT_END
2285#define CACHEsayNO STMT_START { \
2286 if (cache_offset | cache_bit) { \
2287 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2288 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2289 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2290 /* cache records success, but this is failure */ \
2291 DEBUG_r( \
2292 PerlIO_printf(Perl_debug_log, \
2293 "%*s (remove failure from success cache)\n", \
2294 REPORT_CODE_OFF+PL_regindent*2, "") \
2295 ); \
2296 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2297 } \
2298 } \
2299 sayNO; \
2300} STMT_END
2301
a3621e74
YO
2302/* this is used to determine how far from the left messages like
2303 'failed...' are printed. Currently 29 makes these messages line
2304 up with the opcode they refer to. Earlier perls used 25 which
2305 left these messages outdented making reviewing a debug output
2306 quite difficult.
2307*/
2308#define REPORT_CODE_OFF 29
2309
2310
2311/* Make sure there is a test for this +1 options in re_tests */
2312#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2313
2314#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2315 if ( trie->states[ state ].wordnum ) { \
2316 if ( !accepted ) { \
2317 ENTER; \
2318 SAVETMPS; \
2319 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2320 sv_accept_buff=NEWSV( 1234, \
2321 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2322 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2323 SvPOK_on( sv_accept_buff ); \
2324 sv_2mortal( sv_accept_buff ); \
2325 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2326 } else { \
2327 if ( accepted >= bufflen ) { \
2328 bufflen *= 2; \
2329 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2330 bufflen * sizeof(reg_trie_accepted) ); \
2331 } \
2332 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2333 + sizeof( reg_trie_accepted ) ); \
2334 } \
2335 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2336 accept_buff[ accepted ].endpos = uc; \
2337 ++accepted; \
2338 } } STMT_END
2339
2340#define TRIE_HANDLE_CHAR STMT_START { \
2341 if ( uvc < 256 ) { \
2342 charid = trie->charmap[ uvc ]; \
2343 } else { \
2344 charid = 0; \
2345 if( trie->widecharmap ) { \
2346 SV** svpp = (SV**)NULL; \
2347 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2348 sizeof( UV ), 0 ); \
2349 if ( svpp ) { \
2350 charid = (U16)SvIV( *svpp ); \
2351 } \
2352 } \
2353 } \
2354 if ( charid && \
cc601c31
YO
2355 ( base + charid > trie->uniquecharcount ) && \
2356 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2357 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2358 { \
2359 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2360 } else { \
2361 state = 0; \
2362 } \
2363 uc += len; \
2364 } STMT_END
8ba1375e 2365
d6a28714
JH
2366/*
2367 - regmatch - main matching routine
2368 *
2369 * Conceptually the strategy is simple: check to see whether the current
2370 * node matches, call self recursively to see whether the rest matches,
2371 * and then act accordingly. In practice we make some effort to avoid
2372 * recursion, in particular by going through "ordinary" nodes (that don't
2373 * need to know whether the rest of the match failed) by a loop instead of
2374 * by recursion.
2375 */
2376/* [lwall] I've hoisted the register declarations to the outer block in order to
2377 * maybe save a little bit of pushing and popping on the stack. It also takes
2378 * advantage of machines that use a register save mask on subroutine entry.
2379 */
2380STATIC I32 /* 0 failure, 1 success */
2381S_regmatch(pTHX_ regnode *prog)
2382{
d6a28714
JH
2383 register regnode *scan; /* Current node. */
2384 regnode *next; /* Next node. */
2385 regnode *inner; /* Next node in internal branch. */
2386 register I32 nextchr; /* renamed nextchr - nextchar colides with
2387 function of same name */
2388 register I32 n; /* no or next */
b7953727
JH
2389 register I32 ln = 0; /* len or last */
2390 register char *s = Nullch; /* operand or save */
d6a28714 2391 register char *locinput = PL_reginput;
b7953727 2392 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2393 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2394 I32 unwind = 0;
a3621e74
YO
2395
2396 /* used by the trie code */
2397 SV *sv_accept_buff; /* accepting states we have traversed */
2398 reg_trie_accepted *accept_buff; /* "" */
2399 reg_trie_data *trie; /* what trie are we using right now */
2400 U32 accepted = 0; /* how many accepting states we have seen*/
2401
b7953727 2402#if 0
02db2b7b 2403 I32 firstcp = PL_savestack_ix;
b7953727 2404#endif
53c4c00c 2405 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2406#ifdef DEBUGGING
ce333219
JH
2407 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2408 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2409 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74
YO
2410
2411 SV *re_debug_flags;
2a782b5b 2412#endif
02db2b7b 2413
a3621e74
YO
2414 GET_RE_DEBUG_FLAGS;
2415
d6a28714
JH
2416#ifdef DEBUGGING
2417 PL_regindent++;
2418#endif
2419
a3621e74 2420
d6a28714
JH
2421 /* Note that nextchr is a byte even in UTF */
2422 nextchr = UCHARAT(locinput);
2423 scan = prog;
2424 while (scan != NULL) {
8ba1375e 2425
a3621e74 2426 DEBUG_EXECUTE_r( {
d6a28714
JH
2427 SV *prop = sv_newmortal();
2428 int docolor = *PL_colors[0];
2429 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2430 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2431 /* The part of the string before starttry has one color
2432 (pref0_len chars), between starttry and current
2433 position another one (pref_len - pref0_len chars),
2434 after the current position the third one.
2435 We assume that pref0_len <= pref_len, otherwise we
2436 decrease pref0_len. */
9041c2e3 2437 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2438 ? (5 + taill) - l : locinput - PL_bostr;
2439 int pref0_len;
d6a28714 2440
df1ffd02 2441 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2442 pref_len++;
2443 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2444 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2445 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2446 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2447 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2448 l--;
d6a28714
JH
2449 if (pref0_len < 0)
2450 pref0_len = 0;
2451 if (pref0_len > pref_len)
2452 pref0_len = pref_len;
2453 regprop(prop, scan);
2a782b5b
JH
2454 {
2455 char *s0 =
f14c76ed 2456 do_utf8 && OP(scan) != CANY ?
2a782b5b 2457 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2458 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2459 locinput - pref_len;
df1ffd02 2460 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2461 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2462 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2463 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2464 locinput - pref_len + pref0_len;
df1ffd02 2465 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2466 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2467 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2468 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2469 locinput;
df1ffd02 2470 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2471 PerlIO_printf(Perl_debug_log,
2472 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2473 (IV)(locinput - PL_bostr),
2474 PL_colors[4],
2475 len0, s0,
2476 PL_colors[5],
2477 PL_colors[2],
2478 len1, s1,
2479 PL_colors[3],
2480 (docolor ? "" : "> <"),
2481 PL_colors[0],
2482 len2, s2,
2483 PL_colors[1],
2484 15 - l - pref_len + 1,
2485 "",
2486 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2487 SvPVX(prop));
2488 }
2489 });
d6a28714
JH
2490
2491 next = scan + NEXT_OFF(scan);
2492 if (next == scan)
2493 next = NULL;
2494
2495 switch (OP(scan)) {
2496 case BOL:
7fba1cd6 2497 if (locinput == PL_bostr)
d6a28714
JH
2498 {
2499 /* regtill = regbol; */
b8c5462f
JH
2500 break;
2501 }
d6a28714
JH
2502 sayNO;
2503 case MBOL:
12d33761
HS
2504 if (locinput == PL_bostr ||
2505 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2506 {
b8c5462f
JH
2507 break;
2508 }
d6a28714
JH
2509 sayNO;
2510 case SBOL:
c2a73568 2511 if (locinput == PL_bostr)
b8c5462f 2512 break;
d6a28714
JH
2513 sayNO;
2514 case GPOS:
2515 if (locinput == PL_reg_ganch)
2516 break;
2517 sayNO;
2518 case EOL:
d6a28714
JH
2519 goto seol;
2520 case MEOL:
d6a28714 2521 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2522 sayNO;
b8c5462f 2523 break;
d6a28714
JH
2524 case SEOL:
2525 seol:
2526 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2527 sayNO;
d6a28714 2528 if (PL_regeol - locinput > 1)
b8c5462f 2529 sayNO;
b8c5462f 2530 break;
d6a28714
JH
2531 case EOS:
2532 if (PL_regeol != locinput)
b8c5462f 2533 sayNO;
d6a28714 2534 break;
ffc61ed2 2535 case SANY:
d6a28714 2536 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2537 sayNO;
f33976b4
DB
2538 if (do_utf8) {
2539 locinput += PL_utf8skip[nextchr];
2540 if (locinput > PL_regeol)
2541 sayNO;
2542 nextchr = UCHARAT(locinput);
2543 }
2544 else
2545 nextchr = UCHARAT(++locinput);
2546 break;
2547 case CANY:
2548 if (!nextchr && locinput >= PL_regeol)
2549 sayNO;
b8c5462f 2550 nextchr = UCHARAT(++locinput);
a0d0e21e 2551 break;
ffc61ed2 2552 case REG_ANY:
1aa99e6b
IH
2553 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2554 sayNO;
2555 if (do_utf8) {
b8c5462f 2556 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2557 if (locinput > PL_regeol)
2558 sayNO;
a0ed51b3 2559 nextchr = UCHARAT(locinput);
a0ed51b3 2560 }
1aa99e6b
IH
2561 else
2562 nextchr = UCHARAT(++locinput);
a0ed51b3 2563 break;
a3621e74
YO
2564
2565
2566
2567 /*
2568 traverse the TRIE keeping track of all accepting states
2569 we transition through until we get to a failing node.
2570
2571 we use two slightly different pieces of code to handle
2572 the traversal depending on whether its case sensitive or
2573 not. we reuse the accept code however. (this should probably
2574 be turned into a macro.)
2575
2576 */
2577 case TRIEF:
2578 case TRIEFL:
2579 {
2580
2581 U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
2582 U8 *uc = ( U8* )locinput;
2583 U32 state = 1;
2584 U16 charid = 0;
2585 U32 base = 0;
2586 UV uvc = 0;
2587 STRLEN len = 0;
2588 STRLEN foldlen = 0;
2589 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2590 U8 *uscan = (U8*)NULL;
2591 STRLEN bufflen=0;
2592 accepted = 0;
2593
2594 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2595
2596 while ( state && uc <= (U8*)PL_regeol ) {
2597
2598 TRIE_CHECK_STATE_IS_ACCEPTING;
2599
2600 base = trie->states[ state ].trans.base;
2601
2602 DEBUG_TRIE_EXECUTE_r(
2603 PerlIO_printf( Perl_debug_log,
2604 "%*s %sState: %4x, Base: %4x Accepted: %4x ",
2605 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2606 state, base, accepted );
2607 );
2608
2609 if ( base ) {
2610
2611 if ( do_utf8 || UTF ) {
2612 if ( foldlen>0 ) {
2613 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2614 foldlen -= len;
2615 uscan += len;
2616 len=0;
2617 } else {
2618 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2619 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2620 foldlen -= UNISKIP( uvc );
2621 uscan = foldbuf + UNISKIP( uvc );
2622 }
2623 } else {
2624 uvc = (U32)*uc;
2625 len = 1;
2626 }
2627
2628 TRIE_HANDLE_CHAR;
2629
2630 } else {
2631 state = 0;
2632 }
2633 DEBUG_TRIE_EXECUTE_r(
2634 PerlIO_printf( Perl_debug_log,
2635 "Charid:%3x CV:%4x After State: %4x%s\n",
2636 charid, uvc, state, PL_colors[5] );
2637 );
2638 }
2639 if ( !accepted ) {
2640 sayNO;
2641 } else {
2642 goto TrieAccept;
2643 }
2644 }
2645 /* unreached codepoint: we jump into the middle of the next case
2646 from previous if blocks */
2647 case TRIE:
2648 {
2649 U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
2650 U8 *uc = (U8*)locinput;
2651 U32 state = 1;
2652 U16 charid = 0;
2653 U32 base = 0;
2654 UV uvc = 0;
2655 STRLEN len = 0;
2656 STRLEN bufflen = 0;
2657 accepted = 0;
2658
2659 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2660
2661 while ( state && uc <= (U8*)PL_regeol ) {
2662
2663 TRIE_CHECK_STATE_IS_ACCEPTING;
2664
2665 base = trie->states[ state ].trans.base;
2666
2667 DEBUG_TRIE_EXECUTE_r(
2668 PerlIO_printf( Perl_debug_log,
2669 "%*s %sState: %4x, Base: %4x Accepted: %4x ",
2670 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2671 state, base, accepted );
2672 );
2673
2674 if ( base ) {
2675
2676 if ( do_utf8 || UTF ) {
2677 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2678 } else {
2679 uvc = (U32)*uc;
2680 len = 1;
2681 }
2682
2683 TRIE_HANDLE_CHAR;
2684
2685 } else {
2686 state = 0;
2687 }
2688 DEBUG_TRIE_EXECUTE_r(
2689 PerlIO_printf( Perl_debug_log,
2690 "Charid:%3x CV:%4x After State: %4x%s\n",
2691 charid, uvc, state, PL_colors[5] );
2692 );
2693 }
2694 if ( !accepted ) {
2695 sayNO;
2696 }
2697 }
2698
2699
2700 /*
2701 There was at least one accepting state that we
2702 transitioned through. Presumably the number of accepting
2703 states is going to be low, typically one or two. So we
2704 simply scan through to find the one with lowest wordnum.
2705 Once we find it, we swap the last state into its place
2706 and decrement the size. We then try to match the rest of
2707 the pattern at the point where the word ends, if we
2708 succeed then we end the loop, otherwise the loop
2709 eventually terminates once all of the accepting states
2710 have been tried.
2711 */
2712 TrieAccept:
2713 {
2714 int gotit = 0;
2715
2716 if ( accepted == 1 ) {
2717 DEBUG_EXECUTE_r({
2718 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2719 PerlIO_printf( Perl_debug_log,
2720 "%*s %sonly one match : #%d <%s>%s\n",
2721 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],
2722 accept_buff[ 0 ].wordnum,
2723 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",
2724 PL_colors[5] );
2725 });
cc601c31 2726 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2727 /* in this case we free tmps/leave before we call regmatch
2728 as we wont be using accept_buff again. */
2729 FREETMPS;
2730 LEAVE;
2731 gotit = regmatch( scan + NEXT_OFF( scan ) );
2732 } else {
2733 DEBUG_EXECUTE_r(
2734 PerlIO_printf( Perl_debug_log,"%*s %sgot %d possible matches%s\n",
2735 REPORT_CODE_OFF + PL_regindent * 2, "",PL_colors[4], accepted,
2736 PL_colors[5] );
2737 );
2738 while ( !gotit && accepted-- ) {
2739 U32 best = 0;
2740 U32 cur;
2741 for( cur = 1 ; cur <= accepted ; cur++ ) {
2742 DEBUG_TRIE_EXECUTE_r(
2743 PerlIO_printf( Perl_debug_log,
2744 "%*s %sgot %d (%d) as best, looking at %d (%d)%s\n",
2745 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2746 best, accept_buff[ best ].wordnum, cur,
2747 accept_buff[ cur ].wordnum, PL_colors[5] );
2748 );
2749
2750 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2751 best = cur;
2752 }
2753 DEBUG_EXECUTE_r({
2754 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2755 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
2756 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],
2757 accept_buff[best].wordnum,
2758 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",scan,
2759 PL_colors[5] );
2760 });
2761 if ( best<accepted ) {
2762 reg_trie_accepted tmp = accept_buff[ best ];
2763 accept_buff[ best ] = accept_buff[ accepted ];
2764 accept_buff[ accepted ] = tmp;
2765 best = accepted;
2766 }
cc601c31 2767 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2768
2769 /*
2770 as far as I can tell we only need the SAVETMPS/FREETMPS
2771 for re's with EVAL in them but I'm leaving them in for
2772 all until I can be sure.
2773 */
2774 SAVETMPS;
2775 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2776 FREETMPS;
2777 }
2778 FREETMPS;
2779 LEAVE;
2780 }
2781
2782 if ( gotit ) {
2783 sayYES;
2784 } else {
2785 sayNO;
2786 }
2787 }
2788 /* unreached codepoint */
d6a28714 2789 case EXACT:
cd439c50
IZ
2790 s = STRING(scan);
2791 ln = STR_LEN(scan);
eb160463 2792 if (do_utf8 != UTF) {
bc517b45 2793 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2794 char *l = locinput;
2795 char *e = s + ln;
bc517b45 2796 STRLEN ulen;
a72c7584 2797
5ff6fc6d
JH
2798 if (do_utf8) {
2799 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2800 while (s < e) {
2801 if (l >= PL_regeol)
5ff6fc6d
JH
2802 sayNO;
2803 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2804 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2805 ckWARN(WARN_UTF8) ?
2806 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2807 sayNO;
bc517b45 2808 l += ulen;
5ff6fc6d 2809 s ++;
1aa99e6b 2810 }
5ff6fc6d
JH
2811 }
2812 else {
2813 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2814 while (s < e) {
2815 if (l >= PL_regeol)
2816 sayNO;
5ff6fc6d 2817 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2818 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2819 ckWARN(WARN_UTF8) ?
2820 0 : UTF8_ALLOW_ANY))
1aa99e6b 2821 sayNO;
bc517b45 2822 s += ulen;
a72c7584 2823 l ++;
1aa99e6b 2824 }
5ff6fc6d 2825 }
1aa99e6b
IH
2826 locinput = l;
2827 nextchr = UCHARAT(locinput);
2828 break;
2829 }
bc517b45 2830 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2831 /* Inline the first character, for speed. */
2832 if (UCHARAT(s) != nextchr)
2833 sayNO;
2834 if (PL_regeol - locinput < ln)
2835 sayNO;
2836 if (ln > 1 && memNE(s, locinput, ln))
2837 sayNO;
2838 locinput += ln;
2839 nextchr = UCHARAT(locinput);
2840 break;
2841 case EXACTFL:
b8c5462f
JH
2842 PL_reg_flags |= RF_tainted;
2843 /* FALL THROUGH */
d6a28714 2844 case EXACTF:
cd439c50
IZ
2845 s = STRING(scan);
2846 ln = STR_LEN(scan);
d6a28714 2847
d07ddd77
JH
2848 if (do_utf8 || UTF) {
2849 /* Either target or the pattern are utf8. */
d6a28714 2850 char *l = locinput;
d07ddd77 2851 char *e = PL_regeol;
bc517b45 2852
eb160463 2853 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2854 l, &e, 0, do_utf8)) {
5486206c
JH
2855 /* One more case for the sharp s:
2856 * pack("U0U*", 0xDF) =~ /ss/i,
2857 * the 0xC3 0x9F are the UTF-8
2858 * byte sequence for the U+00DF. */
2859 if (!(do_utf8 &&
2860 toLOWER(s[0]) == 's' &&
2861 ln >= 2 &&
2862 toLOWER(s[1]) == 's' &&
2863 (U8)l[0] == 0xC3 &&
2864 e - l >= 2 &&
2865 (U8)l[1] == 0x9F))
2866 sayNO;
2867 }
d07ddd77
JH
2868 locinput = e;
2869 nextchr = UCHARAT(locinput);
2870 break;
a0ed51b3 2871 }
d6a28714 2872
bc517b45
JH
2873 /* Neither the target and the pattern are utf8. */
2874
d6a28714
JH
2875 /* Inline the first character, for speed. */
2876 if (UCHARAT(s) != nextchr &&
2877 UCHARAT(s) != ((OP(scan) == EXACTF)
2878 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2879 sayNO;
d6a28714 2880 if (PL_regeol - locinput < ln)
b8c5462f 2881 sayNO;
d6a28714
JH
2882 if (ln > 1 && (OP(scan) == EXACTF
2883 ? ibcmp(s, locinput, ln)
2884 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2885 sayNO;
d6a28714
JH
2886 locinput += ln;
2887 nextchr = UCHARAT(locinput);
a0d0e21e 2888 break;
d6a28714 2889 case ANYOF:
ffc61ed2 2890 if (do_utf8) {
9e55ce06
JH
2891 STRLEN inclasslen = PL_regeol - locinput;
2892
ba7b4546 2893 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2894 sayNO_ANYOF;
ffc61ed2
JH
2895 if (locinput >= PL_regeol)
2896 sayNO;
0f0076b4 2897 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2898 nextchr = UCHARAT(locinput);
e0f9d4a8 2899 break;
ffc61ed2
JH
2900 }
2901 else {
2902 if (nextchr < 0)
2903 nextchr = UCHARAT(locinput);
7d3e948e 2904 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2905 sayNO_ANYOF;
ffc61ed2
JH
2906 if (!nextchr && locinput >= PL_regeol)
2907 sayNO;
2908 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2909 break;
2910 }
2911 no_anyof:
2912 /* If we might have the case of the German sharp s
2913 * in a casefolding Unicode character class. */
2914
ebc501f0
JH
2915 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2916 locinput += SHARP_S_SKIP;
e0f9d4a8 2917 nextchr = UCHARAT(locinput);
ffc61ed2 2918 }
e0f9d4a8
JH
2919 else
2920 sayNO;
b8c5462f 2921 break;
d6a28714 2922 case ALNUML:
b8c5462f
JH
2923 PL_reg_flags |= RF_tainted;
2924 /* FALL THROUGH */
d6a28714 2925 case ALNUM:
b8c5462f 2926 if (!nextchr)
4633a7c4 2927 sayNO;
ffc61ed2 2928 if (do_utf8) {
ad24be35 2929 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2930 if (!(OP(scan) == ALNUM
3568d838 2931 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2932 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2933 {
2934 sayNO;
a0ed51b3 2935 }
b8c5462f 2936 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2937 nextchr = UCHARAT(locinput);
2938 break;
2939 }
ffc61ed2 2940 if (!(OP(scan) == ALNUM
d6a28714 2941 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2942 sayNO;
b8c5462f 2943 nextchr = UCHARAT(++locinput);
a0d0e21e 2944 break;
d6a28714 2945 case NALNUML:
b8c5462f
JH
2946 PL_reg_flags |= RF_tainted;
2947 /* FALL THROUGH */
d6a28714
JH
2948 case NALNUM:
2949 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2950 sayNO;
ffc61ed2 2951 if (do_utf8) {
8269fa76 2952 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2953 if (OP(scan) == NALNUM
3568d838 2954 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2955 : isALNUM_LC_utf8((U8*)locinput))
2956 {
b8c5462f 2957 sayNO;
d6a28714 2958 }
b8c5462f
JH
2959 locinput += PL_utf8skip[nextchr];
2960 nextchr = UCHARAT(locinput);
2961 break;
2962 }
ffc61ed2 2963 if (OP(scan) == NALNUM
d6a28714 2964 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2965 sayNO;
76e3520e 2966 nextchr = UCHARAT(++locinput);
a0d0e21e 2967 break;
d6a28714
JH
2968 case BOUNDL:
2969 case NBOUNDL:
3280af22 2970 PL_reg_flags |= RF_tainted;
bbce6d69 2971 /* FALL THROUGH */
d6a28714
JH
2972 case BOUND:
2973 case NBOUND:
2974 /* was last char in word? */
ffc61ed2 2975 if (do_utf8) {
12d33761
HS
2976 if (locinput == PL_bostr)
2977 ln = '\n';
ffc61ed2 2978 else {
b4f7163a 2979 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2980
b4f7163a 2981 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2982 }
2983 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2984 ln = isALNUM_uni(ln);
8269fa76 2985 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2986 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2987 }
2988 else {
9041c2e3 2989 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2990 n = isALNUM_LC_utf8((U8*)locinput);
2991 }
a0ed51b3 2992 }
d6a28714 2993 else {
12d33761
HS
2994 ln = (locinput != PL_bostr) ?
2995 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2996 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2997 ln = isALNUM(ln);
2998 n = isALNUM(nextchr);
2999 }
3000 else {
3001 ln = isALNUM_LC(ln);
3002 n = isALNUM_LC(nextchr);
3003 }
d6a28714 3004 }
ffc61ed2
JH
3005 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3006 OP(scan) == BOUNDL))
3007 sayNO;
a0ed51b3 3008 break;
d6a28714 3009 case SPACEL:
3280af22 3010 PL_reg_flags |= RF_tainted;
bbce6d69 3011 /* FALL THROUGH */
d6a28714 3012 case SPACE:
9442cb0e 3013 if (!nextchr)
4633a7c4 3014 sayNO;
1aa99e6b 3015 if (do_utf8) {
fd400ab9 3016 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 3017 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 3018 if (!(OP(scan) == SPACE
3568d838 3019 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
3020 : isSPACE_LC_utf8((U8*)locinput)))
3021 {
3022 sayNO;
3023 }
3024 locinput += PL_utf8skip[nextchr];
3025 nextchr = UCHARAT(locinput);
3026 break;
d6a28714 3027 }
ffc61ed2
JH
3028 if (!(OP(scan) == SPACE
3029 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3030 sayNO;
3031 nextchr = UCHARAT(++locinput);
3032 }
3033 else {
3034 if (!(OP(scan) == SPACE
3035 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3036 sayNO;
3037 nextchr = UCHARAT(++locinput);
a0ed51b3 3038 }
a0ed51b3 3039 break;
d6a28714 3040 case NSPACEL:
3280af22 3041 PL_reg_flags |= RF_tainted;
bbce6d69 3042 /* FALL THROUGH */
d6a28714 3043 case NSPACE:
9442cb0e 3044 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3045 sayNO;
1aa99e6b 3046 if (do_utf8) {
8269fa76 3047 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 3048 if (OP(scan) == NSPACE
3568d838 3049 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3050 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3051 {
3052 sayNO;
3053 }
3054 locinput += PL_utf8skip[nextchr];
3055 nextchr = UCHARAT(locinput);
3056 break;
a0ed51b3 3057 }
ffc61ed2 3058 if (OP(scan) == NSPACE
d6a28714 3059 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3060 sayNO;
76e3520e 3061 nextchr = UCHARAT(++locinput);
a0d0e21e 3062 break;
d6a28714 3063 case DIGITL:
a0ed51b3
LW
3064 PL_reg_flags |= RF_tainted;
3065 /* FALL THROUGH */
d6a28714 3066 case DIGIT:
9442cb0e 3067 if (!nextchr)
a0ed51b3 3068 sayNO;
1aa99e6b 3069 if (do_utf8) {
8269fa76 3070 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 3071 if (!(OP(scan) == DIGIT
3568d838 3072 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3073 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3074 {
a0ed51b3 3075 sayNO;
dfe13c55 3076 }
6f06b55f 3077 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3078 nextchr = UCHARAT(locinput);
3079 break;
3080 }
ffc61ed2 3081 if (!(OP(scan) == DIGIT
9442cb0e 3082 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3083 sayNO;
3084 nextchr = UCHARAT(++locinput);
3085 break;
d6a28714 3086 case NDIGITL:
b8c5462f
JH
3087 PL_reg_flags |= RF_tainted;
3088 /* FALL THROUGH */
d6a28714 3089 case NDIGIT:
9442cb0e 3090 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3091 sayNO;
1aa99e6b 3092 if (do_utf8) {
8269fa76 3093 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 3094 if (OP(scan) == NDIGIT
3568d838 3095 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3096 : isDIGIT_LC_utf8((U8*)locinput))
3097 {
a0ed51b3 3098 sayNO;
9442cb0e 3099 }
6f06b55f 3100 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3101 nextchr = UCHARAT(locinput);
3102 break;
3103 }
ffc61ed2 3104 if (OP(scan) == NDIGIT
9442cb0e 3105 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3106 sayNO;
3107 nextchr = UCHARAT(++locinput);
3108 break;
3109 case CLUMP:
b7c83a7e 3110 if (locinput >= PL_regeol)
a0ed51b3 3111 sayNO;
b7c83a7e
JH
3112 if (do_utf8) {
3113 LOAD_UTF8_CHARCLASS(mark,"~");
3114 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3115 sayNO;
3116 locinput += PL_utf8skip[nextchr];
3117 while (locinput < PL_regeol &&
3118 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3119 locinput += UTF8SKIP(locinput);
3120 if (locinput > PL_regeol)
3121 sayNO;
eb08e2da
JH
3122 }
3123 else
3124 locinput++;
a0ed51b3
LW
3125 nextchr = UCHARAT(locinput);
3126 break;
c8756f30 3127 case REFFL:
3280af22 3128 PL_reg_flags |= RF_tainted;
c8756f30 3129 /* FALL THROUGH */
c277df42 3130 case REF:
c8756f30 3131 case REFF:
c277df42 3132 n = ARG(scan); /* which paren pair */
cf93c79d 3133 ln = PL_regstartp[n];
2c2d71f5 3134 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3135 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3136 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3137 if (ln == PL_regendp[n])
a0d0e21e 3138 break;
a0ed51b3 3139
cf93c79d 3140 s = PL_bostr + ln;
1aa99e6b 3141 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3142 char *l = locinput;
cf93c79d 3143 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3144 /*
3145 * Note that we can't do the "other character" lookup trick as
3146 * in the 8-bit case (no pun intended) because in Unicode we
3147 * have to map both upper and title case to lower case.
3148 */
3149 if (OP(scan) == REFF) {
a2a2844f 3150 STRLEN ulen1, ulen2;
89ebb4a3
JH
3151 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3152 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a0ed51b3
LW
3153 while (s < e) {
3154 if (l >= PL_regeol)
3155 sayNO;
a2a2844f
JH
3156 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3157 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3158 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3159 sayNO;
a2a2844f
JH
3160 s += ulen1;
3161 l += ulen2;
a0ed51b3
LW
3162 }
3163 }
3164 locinput = l;
3165 nextchr = UCHARAT(locinput);
3166 break;
3167 }
3168
a0d0e21e 3169 /* Inline the first character, for speed. */
76e3520e 3170 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3171 (OP(scan) == REF ||
3172 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3173 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3174 sayNO;
cf93c79d 3175 ln = PL_regendp[n] - ln;
3280af22 3176 if (locinput + ln > PL_regeol)
4633a7c4 3177 sayNO;
c8756f30
AK
3178 if (ln > 1 && (OP(scan) == REF
3179 ? memNE(s, locinput, ln)
3180 : (OP(scan) == REFF
3181 ? ibcmp(s, locinput, ln)
3182 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3183 sayNO;
a0d0e21e 3184 locinput += ln;
76e3520e 3185 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3186 break;
3187
3188 case NOTHING:
c277df42 3189 case TAIL:
a0d0e21e
LW
3190 break;
3191 case BACK:
3192 break;
c277df42
IZ
3193 case EVAL:
3194 {
3195 dSP;
533c011a 3196 OP_4tree *oop = PL_op;
3280af22 3197 COP *ocurcop = PL_curcop;
f3548bdc 3198 PAD *old_comppad;
c277df42 3199 SV *ret;
080c2dec 3200 struct regexp *oreg = PL_reg_re;
9041c2e3 3201
c277df42 3202 n = ARG(scan);
533c011a 3203 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3204 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3205 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3206 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3207
8e5e9ebe
RGS
3208 {
3209 SV **before = SP;
3210 CALLRUNOPS(aTHX); /* Scalar context. */
3211 SPAGAIN;
3212 if (SP == before)
075aa684 3213 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3214 else {
3215 ret = POPs;
3216 PUTBACK;
3217 }
3218 }
3219
0f5d15d6 3220 PL_op = oop;
f3548bdc 3221 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3222 PL_curcop = ocurcop;
c277df42 3223 if (logical) {
0f5d15d6
IZ
3224 if (logical == 2) { /* Postponed subexpression. */
3225 regexp *re;
22c35a8c 3226 MAGIC *mg = Null(MAGIC*);
0f5d15d6 3227 re_cc_state state;
0f5d15d6 3228 CHECKPOINT cp, lastcp;
cb50f42d 3229 int toggleutf;
faf82a0b 3230 register SV *sv;
0f5d15d6 3231
faf82a0b
AE
3232 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3233 mg = mg_find(sv, PERL_MAGIC_qr);
3234 else if (SvSMAGICAL(ret)) {
3235 if (SvGMAGICAL(ret))
3236 sv_unmagic(ret, PERL_MAGIC_qr);
3237 else
3238 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3239 }
faf82a0b 3240
0f5d15d6
IZ
3241 if (mg) {
3242 re = (regexp *)mg->mg_obj;
df0003d4 3243 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3244 }
3245 else {
3246 STRLEN len;
3247 char *t = SvPV(ret, len);
3248 PMOP pm;
3249 char *oprecomp = PL_regprecomp;
3250 I32 osize = PL_regsize;
3251 I32 onpar = PL_regnpar;
3252
5fcd1c1b 3253 Zero(&pm, 1, PMOP);
cb50f42d 3254 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 3255 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 3256 if (!(SvFLAGS(ret)
faf82a0b
AE
3257 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3258 | SVs_GMG)))
14befaf4
DM
3259 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3260 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3261 PL_regprecomp = oprecomp;
3262 PL_regsize = osize;
3263 PL_regnpar = onpar;
3264 }
a3621e74 3265 DEBUG_EXECUTE_r(
9041c2e3 3266 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
3267 "Entering embedded `%s%.60s%s%s'\n",
3268 PL_colors[0],
3269 re->precomp,
3270 PL_colors[1],
3271 (strlen(re->precomp) > 60 ? "..." : ""))
3272 );
3273 state.node = next;
3274 state.prev = PL_reg_call_cc;
3275 state.cc = PL_regcc;
3276 state.re = PL_reg_re;
3277
2ab05381 3278 PL_regcc = 0;
9041c2e3 3279
0f5d15d6 3280 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3281 REGCP_SET(lastcp);
0f5d15d6
IZ
3282 cache_re(re);
3283 state.ss = PL_savestack_ix;
3284 *PL_reglastparen = 0;
a01268b5 3285 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3286 PL_reg_call_cc = &state;
3287 PL_reginput = locinput;
cb50f42d
YST
3288 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3289 ((re->reganch & ROPT_UTF8) != 0);
3290 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3291
3292 /* XXXX This is too dramatic a measure... */
3293 PL_reg_maxiter = 0;
3294
0f5d15d6 3295 if (regmatch(re->program + 1)) {
2c914db6
IZ
3296 /* Even though we succeeded, we need to restore
3297 global variables, since we may be wrapped inside
3298 SUSPEND, thus the match may be not finished yet. */
3299
3300 /* XXXX Do this only if SUSPENDed? */
3301 PL_reg_call_cc = state.prev;
3302 PL_regcc = state.cc;
3303 PL_reg_re = state.re;
3304 cache_re(PL_reg_re);
cb50f42d 3305 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3306
3307 /* XXXX This is too dramatic a measure... */
3308 PL_reg_maxiter = 0;
3309
3310 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3311 ReREFCNT_dec(re);
3312 regcpblow(cp);
3313 sayYES;
3314 }
0f5d15d6 3315 ReREFCNT_dec(re);
02db2b7b 3316 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3317 regcppop();
3318 PL_reg_call_cc = state.prev;
3319 PL_regcc = state.cc;
3320 PL_reg_re = state.re;
d3790889 3321 cache_re(PL_reg_re);
cb50f42d 3322 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3323
3324 /* XXXX This is too dramatic a measure... */
3325 PL_reg_maxiter = 0;
3326
8e514ae6 3327 logical = 0;
0f5d15d6
IZ
3328 sayNO;
3329 }
c277df42 3330 sw = SvTRUE(ret);
0f5d15d6 3331 logical = 0;
a0ed51b3 3332 }
080c2dec 3333 else {
3280af22 3334 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3335 cache_re(oreg);
3336 }
c277df42
IZ
3337 break;
3338 }
a0d0e21e 3339 case OPEN:
c277df42 3340 n = ARG(scan); /* which paren pair */
3280af22
NIS
3341 PL_reg_start_tmp[n] = locinput;
3342 if (n > PL_regsize)
3343 PL_regsize = n;
a0d0e21e
LW
3344 break;
3345 case CLOSE:
c277df42 3346 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3347 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3348 PL_regendp[n] = locinput - PL_bostr;
eb160463 3349 if (n > (I32)*PL_reglastparen)
3280af22 3350 *PL_reglastparen = n;
a01268b5 3351 *PL_reglastcloseparen = n;
a0d0e21e 3352 break;
c277df42
IZ
3353 case GROUPP:
3354 n = ARG(scan); /* which paren pair */
eb160463 3355 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3356 break;
3357 case IFTHEN:
2c2d71f5 3358 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3359 if (sw)
3360 next = NEXTOPER(NEXTOPER(scan));
3361 else {
3362 next = scan + ARG(scan);
3363 if (OP(next) == IFTHEN) /* Fake one. */
3364 next = NEXTOPER(NEXTOPER(next));
3365 }
3366 break;
3367 case LOGICAL:
0f5d15d6 3368 logical = scan->flags;
c277df42 3369 break;
2ab05381
IZ
3370/*******************************************************************
3371 PL_regcc contains infoblock about the innermost (...)* loop, and
3372 a pointer to the next outer infoblock.
3373
3374 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3375
3376 1) After matching X, regnode for CURLYX is processed;
3377
9041c2e3 3378 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3379 regmatch() recursively with the starting point at WHILEM node;
3380
3381 3) Each hit of WHILEM node tries to match A and Z (in the order
3382 depending on the current iteration, min/max of {min,max} and
3383 greediness). The information about where are nodes for "A"
3384 and "Z" is read from the infoblock, as is info on how many times "A"
3385 was already matched, and greediness.
3386
3387 4) After A matches, the same WHILEM node is hit again.
3388
3389 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3390 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3391 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3392 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3393 of the external loop.
3394
3395 Currently present infoblocks form a tree with a stem formed by PL_curcc
3396 and whatever it mentions via ->next, and additional attached trees
3397 corresponding to temporarily unset infoblocks as in "5" above.
3398
9041c2e3 3399 In the following picture infoblocks for outer loop of
2ab05381
IZ
3400 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3401 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3402 infoblocks are drawn below the "reset" infoblock.
3403
3404 In fact in the picture below we do not show failed matches for Z and T
3405 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3406 more obvious *why* one needs to *temporary* unset infoblocks.]
3407
3408 Matched REx position InfoBlocks Comment
3409 (Y(A)*?Z)*?T x
3410 Y(A)*?Z)*?T x <- O
3411 Y (A)*?Z)*?T x <- O
3412 Y A)*?Z)*?T x <- O <- I
3413 YA )*?Z)*?T x <- O <- I
3414 YA A)*?Z)*?T x <- O <- I
3415 YAA )*?Z)*?T x <- O <- I
3416 YAA Z)*?T x <- O # Temporary unset I
3417 I
3418
3419 YAAZ Y(A)*?Z)*?T x <- O
3420 I
3421
3422 YAAZY (A)*?Z)*?T x <- O
3423 I
3424
3425 YAAZY A)*?Z)*?T x <- O <- I
3426 I
3427
3428 YAAZYA )*?Z)*?T x <- O <- I
3429 I
3430
3431 YAAZYA Z)*?T x <- O # Temporary unset I
3432 I,I
3433
3434 YAAZYAZ )*?T x <- O
3435 I,I
3436
3437 YAAZYAZ T x # Temporary unset O
3438 O
3439 I,I
3440
3441 YAAZYAZT x
3442 O
3443 I,I
3444 *******************************************************************/
a0d0e21e
LW
3445 case CURLYX: {
3446 CURCUR cc;
3280af22 3447 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3448 /* No need to save/restore up to this paren */
3449 I32 parenfloor = scan->flags;
c277df42
IZ
3450
3451 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3452 next += ARG(next);
3280af22
NIS
3453 cc.oldcc = PL_regcc;
3454 PL_regcc = &cc;
cb434fcc
IZ
3455 /* XXXX Probably it is better to teach regpush to support
3456 parenfloor > PL_regsize... */
eb160463 3457 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3458 parenfloor = *PL_reglastparen; /* Pessimization... */
3459 cc.parenfloor = parenfloor;
a0d0e21e
LW
3460 cc.cur = -1;
3461 cc.min = ARG1(scan);
3462 cc.max = ARG2(scan);
c277df42 3463 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3464 cc.next = next;
3465 cc.minmod = minmod;
3466 cc.lastloc = 0;
3280af22 3467 PL_reginput = locinput;
a0d0e21e
LW
3468 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3469 regcpblow(cp);
3280af22 3470 PL_regcc = cc.oldcc;
4633a7c4 3471 saySAME(n);
a0d0e21e
LW
3472 }
3473 /* NOT REACHED */
3474 case WHILEM: {
3475 /*
3476 * This is really hard to understand, because after we match
3477 * what we're trying to match, we must make sure the rest of
2c2d71f5 3478 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3479 * to go back UP the parse tree by recursing ever deeper. And
3480 * if it fails, we have to reset our parent's current state
3481 * that we can try again after backing off.
3482 */
3483
c277df42 3484 CHECKPOINT cp, lastcp;
3280af22 3485 CURCUR* cc = PL_regcc;
c277df42 3486 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3ab3c9b4 3487 I32 cache_offset = 0, cache_bit = 0;
c277df42 3488
4633a7c4 3489 n = cc->cur + 1; /* how many we know we matched */
3280af22 3490 PL_reginput = locinput;
a0d0e21e 3491
a3621e74 3492 DEBUG_EXECUTE_r(
9041c2e3 3493 PerlIO_printf(Perl_debug_log,
91f3b821 3494 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3495 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3496 (long)n, (long)cc->min,
2797576d 3497 (long)cc->max, PTR2UV(cc))
c277df42 3498 );
4633a7c4 3499
a0d0e21e
LW
3500 /* If degenerate scan matches "", assume scan done. */
3501
579cf2c3 3502 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3503 PL_regcc = cc->oldcc;
2ab05381
IZ
3504 if (PL_regcc)
3505 ln = PL_regcc->cur;
a3621e74 3506 DEBUG_EXECUTE_r(
c3464db5
DD
3507 PerlIO_printf(Perl_debug_log,
3508 "%*s empty match detected, try continuation...\n",
3280af22 3509 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3510 );
a0d0e21e 3511 if (regmatch(cc->next))
4633a7c4 3512 sayYES;
2ab05381
IZ
3513 if (PL_regcc)
3514 PL_regcc->cur = ln;
3280af22 3515 PL_regcc = cc;
4633a7c4 3516 sayNO;
a0d0e21e
LW
3517 }
3518
3519 /* First just match a string of min scans. */
3520
3521 if (n < cc->min) {
3522 cc->cur = n;
3523 cc->lastloc = locinput;
4633a7c4
LW
3524 if (regmatch(cc->scan))
3525 sayYES;
3526 cc->cur = n - 1;
c277df42 3527 cc->lastloc = lastloc;
4633a7c4 3528 sayNO;
a0d0e21e
LW
3529 }
3530
2c2d71f5
JH
3531 if (scan->flags) {
3532 /* Check whether we already were at this position.
3533 Postpone detection until we know the match is not
3534 *that* much linear. */
3535 if (!PL_reg_maxiter) {
3536 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3537 PL_reg_leftiter = PL_reg_maxiter;
3538 }
3539 if (PL_reg_leftiter-- == 0) {
3ab3c9b4 3540 I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
2c2d71f5 3541 if (PL_reg_poscache) {
eb160463 3542 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3543 Renew(PL_reg_poscache, size, char);
3544 PL_reg_poscache_size = size;
3545 }
3546 Zero(PL_reg_poscache, size, char);
3547 }
3548 else {
3549 PL_reg_poscache_size = size;
3550 Newz(29, PL_reg_poscache, size, char);
3551 }
a3621e74 3552 DEBUG_EXECUTE_r(
2c2d71f5
JH
3553 PerlIO_printf(Perl_debug_log,
3554 "%sDetected a super-linear match, switching on caching%s...\n",
3555 PL_colors[4], PL_colors[5])
3556 );
3557 }
3558 if (PL_reg_leftiter < 0) {
3ab3c9b4 3559 cache_offset = locinput - PL_bostr;
2c2d71f5 3560
3ab3c9b4
HS
3561 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3562 + cache_offset * (scan->flags>>4);
3563 cache_bit = cache_offset % 8;
3564 cache_offset /= 8;
3565 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
a3621e74 3566 DEBUG_EXECUTE_r(
2c2d71f5
JH
3567 PerlIO_printf(Perl_debug_log,
3568 "%*s already tried at this position...\n",
3569 REPORT_CODE_OFF+PL_regindent*2, "")
3570 );
3ab3c9b4
HS
3571 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3572 /* cache records success */
c2b0868c
HS
3573 sayYES;
3574 else
3ab3c9b4 3575 /* cache records failure */
c2b0868c 3576 sayNO_SILENT;
2c2d71f5 3577 }
3ab3c9b4 3578 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
2c2d71f5
JH
3579 }
3580 }
3581
a0d0e21e
LW
3582 /* Prefer next over scan for minimal matching. */
3583
3584 if (cc->minmod) {
3280af22 3585 PL_regcc = cc->oldcc;
2ab05381
IZ
3586 if (PL_regcc)
3587 ln = PL_regcc->cur;
5f05dabc 3588 cp = regcppush(cc->parenfloor);
02db2b7b 3589 REGCP_SET(lastcp);
5f05dabc 3590 if (regmatch(cc->next)) {
c277df42 3591 regcpblow(cp);
3ab3c9b4 3592 CACHEsayYES; /* All done. */
5f05dabc 3593 }
02db2b7b 3594 REGCP_UNWIND(lastcp);
5f05dabc 3595 regcppop();
2ab05381
IZ
3596 if (PL_regcc)
3597 PL_regcc->cur = ln;
3280af22 3598 PL_regcc = cc;
a0d0e21e 3599
c277df42 3600 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3601 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3602 && !(PL_reg_flags & RF_warned)) {
3603 PL_reg_flags |= RF_warned;
9014280d 3604 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3605 "Complex regular subexpression recursion",
3606 REG_INFTY - 1);
c277df42 3607 }
3ab3c9b4 3608 CACHEsayNO;
c277df42 3609 }
a687059c 3610
a3621e74 3611 DEBUG_EXECUTE_r(
c3464db5
DD
3612 PerlIO_printf(Perl_debug_log,
3613 "%*s trying longer...\n",
3280af22 3614 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3615 );
a0d0e21e 3616 /* Try scanning more and see if it helps. */
3280af22 3617 PL_reginput = locinput;
a0d0e21e
LW
3618 cc->cur = n;
3619 cc->lastloc = locinput;
5f05dabc 3620 cp = regcppush(cc->parenfloor);
02db2b7b 3621 REGCP_SET(lastcp);
5f05dabc 3622 if (regmatch(cc->scan)) {
c277df42 3623 regcpblow(cp);
3ab3c9b4 3624 CACHEsayYES;
5f05dabc 3625 }
02db2b7b 3626 REGCP_UNWIND(lastcp);
5f05dabc 3627 regcppop();
4633a7c4 3628 cc->cur = n - 1;
c277df42 3629 cc->lastloc = lastloc;
3ab3c9b4 3630 CACHEsayNO;
a0d0e21e
LW
3631 }
3632
3633 /* Prefer scan over next for maximal matching. */
3634
3635 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3636 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3637 cc->cur = n;
3638 cc->lastloc = locinput;
02db2b7b 3639 REGCP_SET(lastcp);
5f05dabc 3640 if (regmatch(cc->scan)) {
c277df42 3641 regcpblow(cp);
3ab3c9b4 3642 CACHEsayYES;
5f05dabc 3643 }
02db2b7b 3644 REGCP_UNWIND(lastcp);
a0d0e21e 3645 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3646 PL_reginput = locinput;
a3621e74 3647 DEBUG_EXECUTE_r(
c3464db5
DD
3648 PerlIO_printf(Perl_debug_log,
3649 "%*s failed, try continuation...\n",
3280af22 3650 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3651 );
3652 }
9041c2e3 3653 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3654 && !(PL_reg_flags & RF_warned)) {
3280af22 3655 PL_reg_flags |= RF_warned;
9014280d 3656 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3657 "Complex regular subexpression recursion",
3658 REG_INFTY - 1);
a0d0e21e
LW
3659 }
3660
3661 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3662 PL_regcc = cc->oldcc;
2ab05381
IZ
3663 if (PL_regcc)
3664 ln = PL_regcc->cur;
a0d0e21e 3665 if (regmatch(cc->next))
3ab3c9b4 3666 CACHEsayYES;
2ab05381
IZ
3667 if (PL_regcc)
3668 PL_regcc->cur = ln;
3280af22 3669 PL_regcc = cc;
4633a7c4 3670 cc->cur = n - 1;
c277df42 3671 cc->lastloc = lastloc;
3ab3c9b4 3672 CACHEsayNO;
a0d0e21e
LW
3673 }
3674 /* NOT REACHED */
9041c2e3 3675 case BRANCHJ:
c277df42
IZ
3676 next = scan + ARG(scan);
3677 if (next == scan)
3678 next = NULL;
3679 inner = NEXTOPER(NEXTOPER(scan));
3680 goto do_branch;
9041c2e3 3681 case BRANCH:
c277df42
IZ
3682 inner = NEXTOPER(scan);
3683 do_branch:
3684 {
c277df42
IZ
3685 c1 = OP(scan);
3686 if (OP(next) != c1) /* No choice. */
3687 next = inner; /* Avoid recursion. */
a0d0e21e 3688 else {
02db2b7b
IZ
3689 I32 lastparen = *PL_reglastparen;
3690 I32 unwind1;
3691 re_unwind_branch_t *uw;
3692
3693 /* Put unwinding data on stack */
3694 unwind1 = SSNEWt(1,re_unwind_branch_t);
3695 uw = SSPTRt(unwind1,re_unwind_branch_t);
3696 uw->prev = unwind;
3697 unwind = unwind1;
3698 uw->type = ((c1 == BRANCH)
3699 ? RE_UNWIND_BRANCH
3700 : RE_UNWIND_BRANCHJ);
3701 uw->lastparen = lastparen;
3702 uw->next = next;
3703 uw->locinput = locinput;
3704 uw->nextchr = nextchr;
3705#ifdef DEBUGGING
3706 uw->regindent = ++PL_regindent;
3707#endif
c277df42 3708
02db2b7b
IZ
3709 REGCP_SET(uw->lastcp);
3710
3711 /* Now go into the first branch */
3712 next = inner;
a687059c 3713 }
a0d0e21e
LW
3714 }
3715 break;
3716 case MINMOD:
3717 minmod = 1;
3718 break;
c277df42
IZ
3719 case CURLYM:
3720 {
00db4c45 3721 I32 l = 0;
c277df42 3722 CHECKPOINT lastcp;
9041c2e3 3723
c277df42 3724 /* We suppose that the next guy does not need
0e788c72 3725 backtracking: in particular, it is of constant non-zero length,
c277df42
IZ
3726 and has no parenths to influence future backrefs. */
3727 ln = ARG1(scan); /* min to match */
3728 n = ARG2(scan); /* max to match */
c277df42
IZ
3729 paren = scan->flags;
3730 if (paren) {
3280af22
NIS
3731 if (paren > PL_regsize)
3732 PL_regsize = paren;
eb160463 3733 if (paren > (I32)*PL_reglastparen)
3280af22 3734 *PL_reglastparen = paren;
c277df42 3735 }
dc45a647 3736 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3737 if (paren)
3738 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3739 PL_reginput = locinput;
c277df42
IZ
3740 if (minmod) {
3741 minmod = 0;
3742 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3743 sayNO;
3280af22 3744 locinput = PL_reginput;
cca55fe3 3745 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3746 regnode *text_node = next;
3747
cca55fe3 3748 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3749
cca55fe3 3750 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3751 else {
cca55fe3 3752 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3753 c1 = c2 = -1000;
3754 goto assume_ok_MM;
cca55fe3
JP
3755 }
3756 else { c1 = (U8)*STRING(text_node); }
af5decee 3757 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3758 c2 = PL_fold[c1];
af5decee 3759 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3760 c2 = PL_fold_locale[c1];
3761 else
3762 c2 = c1;
3763 }
a0ed51b3
LW
3764 }
3765 else
c277df42 3766 c1 = c2 = -1000;
cca55fe3 3767 assume_ok_MM:
02db2b7b 3768 REGCP_SET(lastcp);
0e788c72 3769 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
c277df42
IZ
3770 /* If it could work, try it. */
3771 if (c1 == -1000 ||
3280af22
NIS
3772 UCHARAT(PL_reginput) == c1 ||
3773 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3774 {
3775 if (paren) {
f31a99c8 3776 if (ln) {
cf93c79d
IZ
3777 PL_regstartp[paren] =
3778 HOPc(PL_reginput, -l) - PL_bostr;
3779 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3780 }
3781 else
cf93c79d 3782 PL_regendp[paren] = -1;
c277df42
IZ
3783 }
3784 if (regmatch(next))
3785 sayYES;
02db2b7b 3786 REGCP_UNWIND(lastcp);
c277df42
IZ
3787 }
3788 /* Couldn't or didn't -- move forward. */
3280af22 3789 PL_reginput = locinput;
c277df42
IZ
3790 if (regrepeat_hard(scan, 1, &l)) {
3791 ln++;
3280af22 3792 locinput = PL_reginput;
c277df42
IZ
3793 }
3794 else
3795 sayNO;
3796 }
a0ed51b3
LW
3797 }
3798 else {
c277df42 3799 n = regrepeat_hard(scan, n, &l);
3280af22 3800 locinput = PL_reginput;
a3621e74 3801 DEBUG_EXECUTE_r(
5c0ca799 3802 PerlIO_printf(Perl_debug_log,
faccc32b 3803 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3804 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3805 (IV) n, (IV)l)
c277df42
IZ
3806 );
3807 if (n >= ln) {
cca55fe3 3808 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3809 regnode *text_node = next;
3810
cca55fe3 3811 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3812
cca55fe3 3813 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3814 else {
cca55fe3 3815 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3816 c1 = c2 = -1000;
3817 goto assume_ok_REG;
cca55fe3
JP
3818 }
3819 else { c1 = (U8)*STRING(text_node); }
3820
af5decee 3821 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3822 c2 = PL_fold[c1];
af5decee 3823 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3824 c2 = PL_fold_locale[c1];
3825 else
3826 c2 = c1;
3827 }
a0ed51b3
LW
3828 }
3829 else
c277df42
IZ
3830 c1 = c2 = -1000;
3831 }
cca55fe3 3832 assume_ok_REG:
02db2b7b 3833 REGCP_SET(lastcp);
c277df42
IZ
3834 while (n >= ln) {
3835 /* If it could work, try it. */
3836 if (c1 == -1000 ||
3280af22
NIS
3837 UCHARAT(PL_reginput) == c1 ||
3838 UCHARAT(PL_reginput) == c2)
a0ed51b3 3839 {
a3621e74 3840 DEBUG_EXECUTE_r(
c3464db5 3841 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3842 "%*s trying tail with n=%"IVdf"...\n",
3843 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3844 );
3845 if (paren) {
3846 if (n) {
cf93c79d
IZ
3847 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3848 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3849 }
a0ed51b3 3850 else
cf93c79d 3851 PL_regendp[paren] = -1;
c277df42 3852 }
a0ed51b3
LW
3853 if (regmatch(next))
3854 sayYES;
02db2b7b 3855 REGCP_UNWIND(lastcp);
a0ed51b3 3856 }
c277df42
IZ
3857 /* Couldn't or didn't -- back up. */
3858 n--;
dfe13c55 3859 locinput = HOPc(locinput, -l);
3280af22 3860 PL_reginput = locinput;
c277df42
IZ
3861 }
3862 }
3863 sayNO;
3864 break;
3865 }
3866 case CURLYN:
3867 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3868 if (paren > PL_regsize)
3869 PL_regsize = paren;
eb160463 3870 if (paren > (I32)*PL_reglastparen)
3280af22 3871 *PL_reglastparen = paren;
c277df42
IZ
3872 ln = ARG1(scan); /* min to match */
3873 n = ARG2(scan); /* max to match */
dc45a647 3874 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3875 goto repeat;
a0d0e21e 3876 case CURLY:
c277df42 3877 paren = 0;
a0d0e21e
LW
3878 ln = ARG1(scan); /* min to match */
3879 n = ARG2(scan); /* max to match */
dc45a647 3880 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3881 goto repeat;
3882 case STAR:
3883 ln = 0;
c277df42 3884 n = REG_INFTY;
a0d0e21e 3885 scan = NEXTOPER(scan);
c277df42 3886 paren = 0;
a0d0e21e
LW
3887 goto repeat;
3888 case PLUS:
c277df42
IZ
3889 ln = 1;
3890 n = REG_INFTY;
3891 scan = NEXTOPER(scan);
3892 paren = 0;
3893 repeat:
a0d0e21e
LW
3894 /*
3895 * Lookahead to avoid useless match attempts
3896 * when we know what character comes next.
3897 */
5f80c4cf
JP
3898
3899 /*
3900 * Used to only do .*x and .*?x, but now it allows
3901 * for )'s, ('s and (?{ ... })'s to be in the way
3902 * of the quantifier and the EXACT-like node. -- japhy
3903 */
3904
cca55fe3 3905 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3906 U8 *s;
3907 regnode *text_node = next;
3908
cca55fe3 3909 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3910
cca55fe3 3911 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3912 else {
cca55fe3 3913 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3914 c1 = c2 = -1000;
3915 goto assume_ok_easy;
cca55fe3
JP
3916 }
3917 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3918
3919 if (!UTF) {
3920 c2 = c1 = *s;
f65d3ee7 3921 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3922 c2 = PL_fold[c1];
f65d3ee7 3923 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3924 c2 = PL_fold_locale[c1];
1aa99e6b 3925 }
5f80c4cf 3926 else { /* UTF */
f65d3ee7 3927 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3928 STRLEN ulen1, ulen2;
89ebb4a3
JH
3929 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3930 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
3931
3932 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3933 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3934
89ebb4a3 3935 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
872c91ae
JH
3936 ckWARN(WARN_UTF8) ?
3937 0 : UTF8_ALLOW_ANY);
89ebb4a3 3938 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
872c91ae
JH
3939 ckWARN(WARN_UTF8) ?
3940 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3941 }
3942 else {
89ebb4a3 3943 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
872c91ae
JH
3944 ckWARN(WARN_UTF8) ?
3945 0 : UTF8_ALLOW_ANY);
5f80c4cf 3946 }
1aa99e6b
IH
3947 }
3948 }
bbce6d69 3949 }
a0d0e21e 3950 else
bbce6d69 3951 c1 = c2 = -1000;
cca55fe3 3952 assume_ok_easy:
3280af22 3953 PL_reginput = locinput;
a0d0e21e 3954 if (minmod) {
c277df42 3955 CHECKPOINT lastcp;
a0d0e21e
LW
3956 minmod = 0;
3957 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3958 sayNO;
a0ed51b3 3959 locinput = PL_reginput;
02db2b7b 3960 REGCP_SET(lastcp);
0fe9bf95 3961 if (c1 != -1000) {
1aa99e6b 3962 char *e; /* Should not check after this */
0fe9bf95 3963 char *old = locinput;
b2f2f093 3964 int count = 0;
0fe9bf95 3965
1aa99e6b 3966 if (n == REG_INFTY) {
0fe9bf95 3967 e = PL_regeol - 1;
1aa99e6b
IH
3968 if (do_utf8)
3969 while (UTF8_IS_CONTINUATION(*(U8*)e))
3970 e--;
3971 }
3972 else if (do_utf8) {
3973 int m = n - ln;
3974 for (e = locinput;
3975 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3976 e += UTF8SKIP(e);
3977 }
3978 else {
3979 e = locinput + n - ln;
3980 if (e >= PL_regeol)
3981 e = PL_regeol - 1;
3982 }
0fe9bf95
IZ
3983 while (1) {
3984 /* Find place 'next' could work */
1aa99e6b
IH
3985 if (!do_utf8) {
3986 if (c1 == c2) {
a8e8ab15
JH
3987 while (locinput <= e &&
3988 UCHARAT(locinput) != c1)
1aa99e6b
IH
3989 locinput++;
3990 } else {
9041c2e3 3991 while (locinput <= e
a8e8ab15
JH
3992 && UCHARAT(locinput) != c1
3993 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3994 locinput++;
3995 }
3996 count = locinput - old;
3997 }
3998 else {
3999 STRLEN len;
4000 if (c1 == c2) {
872c91ae
JH
4001 /* count initialised to
4002 * utf8_distance(old, locinput) */
b2f2f093 4003 while (locinput <= e &&
872c91ae 4004 utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4005 UTF8_MAXBYTES, &len,
872c91ae 4006 ckWARN(WARN_UTF8) ?
eb160463 4007 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 4008 locinput += len;
b2f2f093
JH
4009 count++;
4010 }
1aa99e6b 4011 } else {
872c91ae
JH
4012 /* count initialised to
4013 * utf8_distance(old, locinput) */
b2f2f093 4014 while (locinput <= e) {
872c91ae 4015 UV c = utf8n_to_uvchr((U8*)locinput,
89ebb4a3 4016 UTF8_MAXBYTES, &len,
872c91ae
JH
4017 ckWARN(WARN_UTF8) ?
4018 0 : UTF8_ALLOW_ANY);
eb160463 4019 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 4020 break;
b2f2f093
JH
4021 locinput += len;
4022 count++;
1aa99e6b
IH
4023 }
4024 }
0fe9bf95 4025 }
9041c2e3 4026 if (locinput > e)
0fe9bf95
IZ
4027 sayNO;
4028 /* PL_reginput == old now */
4029 if (locinput != old) {
4030 ln = 1; /* Did some */
1aa99e6b 4031 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
4032 sayNO;
4033 }
4034 /* PL_reginput == locinput now */
29d1e993 4035 TRYPAREN(paren, ln, locinput);
0fe9bf95 4036 PL_reginput = locinput; /* Could be reset... */
02db2b7b 4037 REGCP_UNWIND(lastcp);
0fe9bf95 4038 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
4039 old = locinput;
4040 if (do_utf8)
4041 locinput += UTF8SKIP(locinput);
4042 else
4043 locinput++;
b2f2f093 4044 count = 1;
0fe9bf95
IZ
4045 }
4046 }
4047 else
c277df42 4048 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
4049 UV c;
4050 if (c1 != -1000) {
4051 if (do_utf8)
872c91ae 4052 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4053 UTF8_MAXBYTES, 0,
872c91ae
JH
4054 ckWARN(WARN_UTF8) ?
4055 0 : UTF8_ALLOW_ANY);
1aa99e6b 4056 else
9041c2e3 4057 c = UCHARAT(PL_reginput);
2390ecbc 4058 /* If it could work, try it. */
eb160463 4059 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 4060 {
ecc99935 4061 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
4062 REGCP_UNWIND(lastcp);
4063 }
1aa99e6b 4064 }
a0d0e21e 4065 /* If it could work, try it. */
2390ecbc 4066 else if (c1 == -1000)
bbce6d69 4067 {
ecc99935 4068 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 4069 REGCP_UNWIND(lastcp);
bbce6d69 4070 }
c277df42 4071 /* Couldn't or didn't -- move forward. */
a0ed51b3 4072 PL_reginput = locinput;
a0d0e21e
LW
4073 if (regrepeat(scan, 1)) {
4074 ln++;
a0ed51b3
LW
4075 locinput = PL_reginput;
4076 }
4077 else
4633a7c4 4078 sayNO;
a0d0e21e
LW
4079 }
4080 }
4081 else {
c277df42 4082 CHECKPOINT lastcp;
a0d0e21e 4083 n = regrepeat(scan, n);
a0ed51b3 4084 locinput = PL_reginput;
22c35a8c 4085 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
7fba1cd6 4086 (OP(next) != MEOL ||
15272685
HS
4087 OP(next) == SEOL || OP(next) == EOS))
4088 {
a0d0e21e 4089 ln = n; /* why back off? */
1aeab75a
GS
4090 /* ...because $ and \Z can match before *and* after
4091 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4092 We should back off by one in this case. */
4093 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
4094 ln--;
4095 }
02db2b7b 4096 REGCP_SET(lastcp);
c277df42 4097 if (paren) {
8fa7f367 4098 UV c = 0;
c277df42 4099 while (n >= ln) {
1aa99e6b
IH
4100 if (c1 != -1000) {
4101 if (do_utf8)
872c91ae 4102 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4103 UTF8_MAXBYTES, 0,
872c91ae
JH
4104 ckWARN(WARN_UTF8) ?
4105 0 : UTF8_ALLOW_ANY);
1aa99e6b 4106 else
9041c2e3 4107 c = UCHARAT(PL_reginput);
1aa99e6b 4108 }
c277df42 4109 /* If it could work, try it. */
eb160463 4110 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 4111 {
29d1e993 4112 TRYPAREN(paren, n, PL_reginput);
02db2b7b 4113 REGCP_UNWIND(lastcp);
c277df42
IZ
4114 }
4115 /* Couldn't or didn't -- back up. */
4116 n--;
dfe13c55 4117 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 4118 }
a0ed51b3
LW
4119 }
4120 else {
8fa7f367 4121 UV c = 0;
c277df42 4122 while (n >= ln) {
1aa99e6b
IH
4123 if (c1 != -1000) {
4124 if (do_utf8)
872c91ae 4125 c = utf8n_to_uvchr((U8*)PL_reginput,
89ebb4a3 4126 UTF8_MAXBYTES, 0,
872c91ae
JH
4127 ckWARN(WARN_UTF8) ?
4128 0 : UTF8_ALLOW_ANY);
1aa99e6b 4129 else
9041c2e3 4130 c = UCHARAT(PL_reginput);
1aa99e6b 4131 }
c277df42 4132 /* If it could work, try it. */
eb160463 4133 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 4134 {
29d1e993 4135 TRYPAREN(paren, n, PL_reginput);
02db2b7b 4136 REGCP_UNWIND(lastcp);
c277df42
IZ
4137 }
4138 /* Couldn't or didn't -- back up. */
4139 n--;
dfe13c55 4140 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 4141 }
a0d0e21e
LW
4142 }
4143 }
4633a7c4 4144 sayNO;
c277df42 4145 break;
a0d0e21e 4146 case END:
0f5d15d6
IZ
4147 if (PL_reg_call_cc) {
4148 re_cc_state *cur_call_cc = PL_reg_call_cc;
4149 CURCUR *cctmp = PL_regcc;
4150 regexp *re = PL_reg_re;
4151 CHECKPOINT cp, lastcp;
4152
4153 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 4154 REGCP_SET(lastcp);
0f5d15d6
IZ
4155 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
4156 the caller. */
4157 PL_reginput = locinput; /* Make position available to
4158 the callcc. */
4159 cache_re(PL_reg_call_cc->re);
4160 PL_regcc = PL_reg_call_cc->cc;
4161 PL_reg_call_cc = PL_reg_call_cc->prev;
4162 if (regmatch(cur_call_cc->node)) {
4163 PL_reg_call_cc = cur_call_cc;
4164 regcpblow(cp);
4165 sayYES;
4166 }
02db2b7b 4167 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
4168 regcppop();
4169 PL_reg_call_cc = cur_call_cc;
4170 PL_regcc = cctmp;
4171 PL_reg_re = re;
4172 cache_re(re);
4173
a3621e74 4174 DEBUG_EXECUTE_r(
0f5d15d6
IZ
4175 PerlIO_printf(Perl_debug_log,
4176 "%*s continuation failed...\n",
4177 REPORT_CODE_OFF+PL_regindent*2, "")
4178 );
7821416a 4179 sayNO_SILENT;
0f5d15d6 4180 }
7821416a 4181 if (locinput < PL_regtill) {
a3621e74 4182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
4183 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4184 PL_colors[4],
4185 (long)(locinput - PL_reg_starttry),
4186 (long)(PL_regtill - PL_reg_starttry),
4187 PL_colors[5]));
4188 sayNO_FINAL; /* Cannot match: too short. */
4189 }
4190 PL_reginput = locinput; /* put where regtry can find it */
4191 sayYES_FINAL; /* Success! */
7e5428c5 4192 case SUCCEED:
3280af22 4193 PL_reginput = locinput; /* put where regtry can find it */
7821416a 4194 sayYES_LOUD; /* Success! */
c277df42
IZ
4195 case SUSPEND:
4196 n = 1;
9fe1d20c 4197 PL_reginput = locinput;
9041c2e3 4198 goto do_ifmatch;
a0d0e21e 4199 case UNLESSM:
c277df42 4200 n = 0;
a0ed51b3 4201 if (scan->flags) {
efb30f32
HS
4202 s = HOPBACKc(locinput, scan->flags);
4203 if (!s)
4204 goto say_yes;
4205 PL_reginput = s;
a0ed51b3
LW
4206 }
4207 else
4208 PL_reginput = locinput;
c277df42
IZ
4209 goto do_ifmatch;
4210 case IFMATCH:
4211 n = 1;
a0ed51b3 4212 if (scan->flags) {
efb30f32
HS
4213 s = HOPBACKc(locinput, scan->flags);
4214 if (!s)
4215 goto say_no;
4216 PL_reginput = s;
a0ed51b3
LW
4217 }
4218 else
4219 PL_reginput = locinput;
4220
c277df42 4221 do_ifmatch:
c277df42
IZ
4222 inner = NEXTOPER(NEXTOPER(scan));
4223 if (regmatch(inner) != n) {
4224 say_no:
4225 if (logical) {
4226 logical = 0;
4227 sw = 0;
4228 goto do_longjump;
a0ed51b3
LW
4229 }
4230 else
c277df42
IZ
4231 sayNO;
4232 }
4233 say_yes:
4234 if (logical) {
4235 logical = 0;
4236 sw = 1;
4237 }
fe44a5e8 4238 if (OP(scan) == SUSPEND) {
3280af22 4239 locinput = PL_reginput;
565764a8 4240 nextchr = UCHARAT(locinput);
fe44a5e8 4241 }
c277df42
IZ
4242 /* FALL THROUGH. */
4243 case LONGJMP:
4244 do_longjump:
4245 next = scan + ARG(scan);
4246 if (next == scan)
4247 next = NULL;
a0d0e21e
LW
4248 break;
4249 default:
b900a521 4250 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 4251 PTR2UV(scan), OP(scan));
cea2e8a9 4252 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 4253 }
02db2b7b 4254 reenter:
a0d0e21e
LW
4255 scan = next;
4256 }
a687059c 4257
a0d0e21e
LW
4258 /*
4259 * We get here only if there's trouble -- normally "case END" is
4260 * the terminating point.
4261 */
cea2e8a9 4262 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 4263 /*NOTREACHED*/
4633a7c4
LW
4264 sayNO;
4265
7821416a 4266yes_loud:
a3621e74 4267 DEBUG_EXECUTE_r(
7821416a
IZ
4268 PerlIO_printf(Perl_debug_log,
4269 "%*s %scould match...%s\n",
4270 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
4271 );
4272 goto yes;
4273yes_final:
a3621e74 4274 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
7821416a 4275 PL_colors[4],PL_colors[5]));
4633a7c4
LW
4276yes:
4277#ifdef DEBUGGING
3280af22 4278 PL_regindent--;
4633a7c4 4279#endif
02db2b7b
IZ
4280
4281#if 0 /* Breaks $^R */
4282 if (unwind)
4283 regcpblow(firstcp);
4284#endif
4633a7c4
LW
4285 return 1;
4286
4287no:
a3621e74 4288 DEBUG_EXECUTE_r(
7821416a
IZ
4289 PerlIO_printf(Perl_debug_log,
4290 "%*s %sfailed...%s\n",
4291 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
4292 );
4293 goto do_no;
4294no_final:
4295do_no:
02db2b7b
IZ
4296 if (unwind) {
4297 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
4298
4299 switch (uw->type) {
4300 case RE_UNWIND_BRANCH:
4301 case RE_UNWIND_BRANCHJ:
4302 {
4303 re_unwind_branch_t *uwb = &(uw->branch);
4304 I32 lastparen = uwb->lastparen;
9041c2e3 4305
02db2b7b
IZ
4306 REGCP_UNWIND(uwb->lastcp);
4307 for (n = *PL_reglastparen; n > lastparen; n--)
4308 PL_regendp[n] = -1;
4309 *PL_reglastparen = n;
4310 scan = next = uwb->next;
9041c2e3
NIS
4311 if ( !scan ||
4312 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
4313 ? BRANCH : BRANCHJ) ) { /* Failure */
4314 unwind = uwb->prev;
4315#ifdef DEBUGGING
4316 PL_regindent--;
4317#endif
4318 goto do_no;
4319 }
4320 /* Have more choice yet. Reuse the same uwb. */
4321 /*SUPPRESS 560*/
4322 if ((n = (uwb->type == RE_UNWIND_BRANCH
4323 ? NEXT_OFF(next) : ARG(next))))
4324 next += n;
4325 else
4326 next = NULL; /* XXXX Needn't unwinding in this case... */
4327 uwb->next = next;
4328 next = NEXTOPER(scan);
4329 if (uwb->type == RE_UNWIND_BRANCHJ)
4330 next = NEXTOPER(next);
4331 locinput = uwb->locinput;
4332 nextchr = uwb->nextchr;
4333#ifdef DEBUGGING
4334 PL_regindent = uwb->regindent;
4335#endif
4336
4337 goto reenter;
4338 }
4339 /* NOT REACHED */
4340 default:
4341 Perl_croak(aTHX_ "regexp unwind memory corruption");
4342 }
4343 /* NOT REACHED */
4344 }
4633a7c4 4345#ifdef DEBUGGING
3280af22 4346 PL_regindent--;
4633a7c4 4347#endif
a0d0e21e 4348 return 0;
a687059c
LW
4349}
4350
4351/*
4352 - regrepeat - repeatedly match something simple, report how many
4353 */
4354/*
4355 * [This routine now assumes that it will only match on things of length 1.
4356 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4357 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4358 */
76e3520e 4359STATIC I32
cea2e8a9 4360S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 4361{
a0d0e21e 4362 register char *scan;
a0d0e21e 4363 register I32 c;
3280af22 4364 register char *loceol = PL_regeol;
a0ed51b3 4365 register I32 hardcount = 0;
53c4c00c 4366 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4367
3280af22 4368 scan = PL_reginput;
faf11cac
HS
4369 if (max == REG_INFTY)
4370 max = I32_MAX;
4371 else if (max < loceol - scan)
a0d0e21e 4372 loceol = scan + max;
a0d0e21e 4373 switch (OP(p)) {
22c35a8c 4374 case REG_ANY:
1aa99e6b 4375 if (do_utf8) {
ffc61ed2 4376 loceol = PL_regeol;
1aa99e6b 4377 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4378 scan += UTF8SKIP(scan);
4379 hardcount++;
4380 }
4381 } else {
4382 while (scan < loceol && *scan != '\n')
4383 scan++;
a0ed51b3
LW
4384 }
4385 break;
ffc61ed2 4386 case SANY:
def8e4ea
JH
4387 if (do_utf8) {
4388 loceol = PL_regeol;
a0804c9e 4389 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4390 scan += UTF8SKIP(scan);
4391 hardcount++;
4392 }
4393 }
4394 else
4395 scan = loceol;
a0ed51b3 4396 break;
f33976b4
DB
4397 case CANY:
4398 scan = loceol;
4399 break;
090f7165
JH
4400 case EXACT: /* length of string is 1 */
4401 c = (U8)*STRING(p);
4402 while (scan < loceol && UCHARAT(scan) == c)
4403 scan++;
bbce6d69 4404 break;
4405 case EXACTF: /* length of string is 1 */
cd439c50 4406 c = (U8)*STRING(p);
bbce6d69 4407 while (scan < loceol &&
22c35a8c 4408 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4409 scan++;
4410 break;
4411 case EXACTFL: /* length of string is 1 */
3280af22 4412 PL_reg_flags |= RF_tainted;
cd439c50 4413 c = (U8)*STRING(p);
bbce6d69 4414 while (scan < loceol &&
22c35a8c 4415 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4416 scan++;
4417 break;
4418 case ANYOF:
ffc61ed2
JH
4419 if (do_utf8) {
4420 loceol = PL_regeol;
cfc92286
JH
4421 while (hardcount < max && scan < loceol &&
4422 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4423 scan += UTF8SKIP(scan);
4424 hardcount++;
4425 }
4426 } else {
7d3e948e 4427 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4428 scan++;
4429 }
a0d0e21e
LW
4430 break;
4431 case ALNUM:
1aa99e6b 4432 if (do_utf8) {
ffc61ed2 4433 loceol = PL_regeol;
8269fa76 4434 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4435 while (hardcount < max && scan < loceol &&
3568d838 4436 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4437 scan += UTF8SKIP(scan);
4438 hardcount++;
4439 }
4440 } else {
4441 while (scan < loceol && isALNUM(*scan))
4442 scan++;
a0ed51b3
LW
4443 }
4444 break;
bbce6d69 4445 case ALNUML:
3280af22 4446 PL_reg_flags |= RF_tainted;
1aa99e6b 4447 if (do_utf8) {
ffc61ed2 4448 loceol = PL_regeol;
1aa99e6b
IH
4449 while (hardcount < max && scan < loceol &&
4450 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4451 scan += UTF8SKIP(scan);
4452 hardcount++;
4453 }
4454 } else {
4455 while (scan < loceol && isALNUM_LC(*scan))
4456 scan++;
a0ed51b3
LW
4457 }
4458 break;
a0d0e21e 4459 case NALNUM:
1aa99e6b 4460 if (do_utf8) {
ffc61ed2 4461 loceol = PL_regeol;
8269fa76 4462 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4463 while (hardcount < max && scan < loceol &&
3568d838 4464 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4465 scan += UTF8SKIP(scan);
4466 hardcount++;
4467 }
4468 } else {
4469 while (scan < loceol && !isALNUM(*scan))
4470 scan++;
a0ed51b3
LW
4471 }
4472 break;
bbce6d69 4473 case NALNUML:
3280af22 4474 PL_reg_flags |= RF_tainted;
1aa99e6b 4475 if (do_utf8) {
ffc61ed2 4476 loceol = PL_regeol;
1aa99e6b
IH
4477 while (hardcount < max && scan < loceol &&
4478 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4479 scan += UTF8SKIP(scan);
4480 hardcount++;
4481 }
4482 } else {
4483 while (scan < loceol && !isALNUM_LC(*scan))
4484 scan++;
a0ed51b3
LW
4485 }
4486 break;
a0d0e21e 4487 case SPACE:
1aa99e6b 4488 if (do_utf8) {
ffc61ed2 4489 loceol = PL_regeol;
8269fa76 4490 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4491 while (hardcount < max && scan < loceol &&
3568d838
JH
4492 (*scan == ' ' ||
4493 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4494 scan += UTF8SKIP(scan);
4495 hardcount++;
4496 }
4497 } else {
4498 while (scan < loceol && isSPACE(*scan))
4499 scan++;
a0ed51b3
LW
4500 }
4501 break;
bbce6d69 4502 case SPACEL:
3280af22 4503 PL_reg_flags |= RF_tainted;
1aa99e6b 4504 if (do_utf8) {
ffc61ed2 4505 loceol = PL_regeol;
1aa99e6b 4506 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4507 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4508 scan += UTF8SKIP(scan);
4509 hardcount++;
4510 }
4511 } else {
4512 while (scan < loceol && isSPACE_LC(*scan))
4513 scan++;
a0ed51b3
LW
4514 }
4515 break;
a0d0e21e 4516 case NSPACE:
1aa99e6b 4517 if (do_utf8) {
ffc61ed2 4518 loceol = PL_regeol;
8269fa76 4519 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4520 while (hardcount < max && scan < loceol &&
3568d838
JH
4521 !(*scan == ' ' ||
4522 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4523 scan += UTF8SKIP(scan);
4524 hardcount++;
4525 }
4526 } else {
4527 while (scan < loceol && !isSPACE(*scan))
4528 scan++;
4529 break;
a0ed51b3 4530 }
bbce6d69 4531 case NSPACEL:
3280af22 4532 PL_reg_flags |= RF_tainted;
1aa99e6b 4533 if (do_utf8) {
ffc61ed2 4534 loceol = PL_regeol;
1aa99e6b 4535 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4536 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4537 scan += UTF8SKIP(scan);
4538 hardcount++;
4539 }
4540 } else {
4541 while (scan < loceol && !isSPACE_LC(*scan))
4542 scan++;
a0ed51b3
LW
4543 }
4544 break;
a0d0e21e 4545 case DIGIT:
1aa99e6b 4546 if (do_utf8) {
ffc61ed2 4547 loceol = PL_regeol;
8269fa76 4548 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4549 while (hardcount < max && scan < loceol &&
3568d838 4550 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4551 scan += UTF8SKIP(scan);
4552 hardcount++;
4553 }
4554 } else {
4555 while (scan < loceol && isDIGIT(*scan))
4556 scan++;
a0ed51b3
LW
4557 }
4558 break;
a0d0e21e 4559 case NDIGIT:
1aa99e6b 4560 if (do_utf8) {
ffc61ed2 4561 loceol = PL_regeol;
8269fa76 4562 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4563 while (hardcount < max && scan < loceol &&
3568d838 4564 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4565 scan += UTF8SKIP(scan);
4566 hardcount++;
4567 }
4568 } else {
4569 while (scan < loceol && !isDIGIT(*scan))
4570 scan++;
a0ed51b3
LW
4571 }
4572 break;
a0d0e21e
LW
4573 default: /* Called on something of 0 width. */
4574 break; /* So match right here or not at all. */
4575 }
a687059c 4576
a0ed51b3
LW
4577 if (hardcount)
4578 c = hardcount;
4579 else
4580 c = scan - PL_reginput;
3280af22 4581 PL_reginput = scan;
a687059c 4582
a3621e74
YO
4583 DEBUG_r({
4584 SV *re_debug_flags;
c277df42 4585 SV *prop = sv_newmortal();
a3621e74
YO
4586 GET_RE_DEBUG_FLAGS;
4587 DEBUG_EXECUTE_r({
c277df42 4588 regprop(prop, p);
9041c2e3
NIS
4589 PerlIO_printf(Perl_debug_log,
4590 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4591 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4592 });
a3621e74 4593 });
9041c2e3 4594
a0d0e21e 4595 return(c);
a687059c
LW
4596}
4597
4598/*
c277df42 4599 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4600 *
0e788c72 4601 * The repeater is supposed to have constant non-zero length.
c277df42
IZ
4602 */
4603
76e3520e 4604STATIC I32
cea2e8a9 4605S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4606{
b7953727 4607 register char *scan = Nullch;
c277df42 4608 register char *start;
3280af22 4609 register char *loceol = PL_regeol;
a0ed51b3 4610 I32 l = 0;
708e3b05 4611 I32 count = 0, res = 1;
a0ed51b3
LW
4612
4613 if (!max)
4614 return 0;
c277df42 4615
3280af22 4616 start = PL_reginput;
53c4c00c 4617 if (PL_reg_match_utf8) {
708e3b05 4618 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4619 if (!count++) {
4620 l = 0;
4621 while (start < PL_reginput) {
4622 l++;
4623 start += UTF8SKIP(start);
4624 }
4625 *lp = l;
4626 if (l == 0)
4627 return max;
4628 }
4629 if (count == max)
4630 return count;
4631 }
4632 }
4633 else {
708e3b05 4634 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4635 if (!count++) {
4636 *lp = l = PL_reginput - start;
4637 if (max != REG_INFTY && l*max < loceol - scan)
4638 loceol = scan + l*max;
4639 if (l == 0)
4640 return max;
c277df42
IZ
4641 }
4642 }
4643 }
708e3b05 4644 if (!res)
3280af22 4645 PL_reginput = scan;
9041c2e3 4646
a0ed51b3 4647 return count;
c277df42
IZ
4648}
4649
4650/*
ffc61ed2
JH
4651- regclass_swash - prepare the utf8 swash
4652*/
4653
4654SV *
9e55ce06 4655Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4656{
9e55ce06
JH
4657 SV *sw = NULL;
4658 SV *si = NULL;
4659 SV *alt = NULL;
ffc61ed2
JH
4660
4661 if (PL_regdata && PL_regdata->count) {
4662 U32 n = ARG(node);
4663
4664 if (PL_regdata->what[n] == 's') {
4665 SV *rv = (SV*)PL_regdata->data[n];
4666 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4667 SV **ary = AvARRAY(av);
9e55ce06 4668 SV **a, **b;
9041c2e3 4669
9e55ce06
JH
4670 /* See the end of regcomp.c:S_reglass() for
4671 * documentation of these array elements. */
4672
b11f357e
JH
4673 si = *ary;
4674 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4675 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4676
ffc61ed2
JH
4677 if (a)
4678 sw = *a;
4679 else if (si && doinit) {
4680 sw = swash_init("utf8", "", si, 1, 0);
4681 (void)av_store(av, 1, sw);
4682 }
9e55ce06
JH
4683 if (b)
4684 alt = *b;
ffc61ed2
JH
4685 }
4686 }
4687
9e55ce06
JH
4688 if (listsvp)
4689 *listsvp = si;
4690 if (altsvp)
4691 *altsvp = alt;
ffc61ed2
JH
4692
4693 return sw;
4694}
4695
4696/*
ba7b4546 4697 - reginclass - determine if a character falls into a character class
832705d4
JH
4698
4699 The n is the ANYOF regnode, the p is the target string, lenp
4700 is pointer to the maximum length of how far to go in the p
4701 (if the lenp is zero, UTF8SKIP(p) is used),
4702 do_utf8 tells whether the target string is in UTF-8.
4703
bbce6d69 4704 */
4705
76e3520e 4706STATIC bool
ba7b4546 4707S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4708{
ffc61ed2 4709 char flags = ANYOF_FLAGS(n);
bbce6d69 4710 bool match = FALSE;
cc07378b 4711 UV c = *p;
ae9ddab8 4712 STRLEN len = 0;
9e55ce06 4713 STRLEN plen;
1aa99e6b 4714
ae9ddab8 4715 if (do_utf8 && !UTF8_IS_INVARIANT(c))
89ebb4a3 4716 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
ae9ddab8 4717 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4718
0f0076b4 4719 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4720 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4721 if (lenp)
4722 *lenp = 0;
ffc61ed2 4723 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4724 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4725 match = TRUE;
bbce6d69 4726 }
3568d838 4727 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4728 match = TRUE;
ffc61ed2 4729 if (!match) {
9e55ce06
JH
4730 AV *av;
4731 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4732
4733 if (sw) {
3568d838 4734 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4735 match = TRUE;
4736 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4737 if (!match && lenp && av) {
4738 I32 i;
4739
4740 for (i = 0; i <= av_len(av); i++) {
4741 SV* sv = *av_fetch(av, i, FALSE);
4742 STRLEN len;
4743 char *s = SvPV(sv, len);
4744
061b10df 4745 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4746 *lenp = len;
4747 match = TRUE;
4748 break;
4749 }
4750 }
4751 }
4752 if (!match) {
89ebb4a3 4753 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4a623e43
JH
4754 STRLEN tmplen;
4755
9e55ce06
JH
4756 to_utf8_fold(p, tmpbuf, &tmplen);
4757 if (swash_fetch(sw, tmpbuf, do_utf8))
4758 match = TRUE;
4759 }
ffc61ed2
JH
4760 }
4761 }
bbce6d69 4762 }
9e55ce06 4763 if (match && lenp && *lenp == 0)
0f0076b4 4764 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4765 }
1aa99e6b 4766 if (!match && c < 256) {
ffc61ed2
JH
4767 if (ANYOF_BITMAP_TEST(n, c))
4768 match = TRUE;
4769 else if (flags & ANYOF_FOLD) {
eb160463 4770 U8 f;
a0ed51b3 4771
ffc61ed2
JH
4772 if (flags & ANYOF_LOCALE) {
4773 PL_reg_flags |= RF_tainted;
4774 f = PL_fold_locale[c];
4775 }
4776 else
4777 f = PL_fold[c];
4778 if (f != c && ANYOF_BITMAP_TEST(n, f))
4779 match = TRUE;
4780 }
4781
4782 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4783 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4784 if (
4785 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4786 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4787 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4788 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4789 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4790 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4791 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4792 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4793 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4794 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4795 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4796 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4797 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4798 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4799 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4800 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4801 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4802 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4803 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4804 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4805 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4806 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4807 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4808 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4809 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4810 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4811 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4812 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4813 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4814 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4815 ) /* How's that for a conditional? */
4816 {
4817 match = TRUE;
4818 }
a0ed51b3 4819 }
a0ed51b3
LW
4820 }
4821
a0ed51b3
LW
4822 return (flags & ANYOF_INVERT) ? !match : match;
4823}
161b471a 4824
dfe13c55 4825STATIC U8 *
cea2e8a9 4826S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4827{
1aa99e6b
IH
4828 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4829}
4830
4831STATIC U8 *
4832S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4833{
a0ed51b3 4834 if (off >= 0) {
1aa99e6b 4835 while (off-- && s < lim) {
ffc61ed2 4836 /* XXX could check well-formedness here */
a0ed51b3 4837 s += UTF8SKIP(s);
ffc61ed2 4838 }
a0ed51b3
LW
4839 }
4840 else {
4841 while (off++) {
1aa99e6b 4842 if (s > lim) {
a0ed51b3 4843 s--;
ffc61ed2 4844 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4845 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4846 s--;
ffc61ed2
JH
4847 }
4848 /* XXX could check well-formedness here */
a0ed51b3
LW
4849 }
4850 }
4851 }
4852 return s;
4853}
161b471a 4854
dfe13c55 4855STATIC U8 *
1aa99e6b 4856S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4857{
1aa99e6b
IH
4858 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4859}
4860
4861STATIC U8 *
4862S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4863{
4864 if (off >= 0) {
1aa99e6b 4865 while (off-- && s < lim) {
ffc61ed2 4866 /* XXX could check well-formedness here */
a0ed51b3 4867 s += UTF8SKIP(s);
ffc61ed2 4868 }
a0ed51b3
LW
4869 if (off >= 0)
4870 return 0;
4871 }
4872 else {
4873 while (off++) {
1aa99e6b 4874 if (s > lim) {
a0ed51b3 4875 s--;
ffc61ed2 4876 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4877 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4878 s--;
ffc61ed2
JH
4879 }
4880 /* XXX could check well-formedness here */
a0ed51b3
LW
4881 }
4882 else
4883 break;
4884 }
4885 if (off <= 0)
4886 return 0;
4887 }
4888 return s;
4889}
51371543 4890
51371543 4891static void
acfe0abc 4892restore_pos(pTHX_ void *arg)
51371543 4893{
06b5626a 4894 (void)arg; /* unused */
51371543
GS
4895 if (PL_reg_eval_set) {
4896 if (PL_reg_oldsaved) {
4897 PL_reg_re->subbeg = PL_reg_oldsaved;
4898 PL_reg_re->sublen = PL_reg_oldsavedlen;
ed252734
NC
4899#ifdef PERL_COPY_ON_WRITE
4900 PL_reg_re->saved_copy = PL_nrs;
4901#endif
51371543
GS
4902 RX_MATCH_COPIED_on(PL_reg_re);
4903 }
4904 PL_reg_magic->mg_len = PL_reg_oldpos;
4905 PL_reg_eval_set = 0;
4906 PL_curpm = PL_reg_oldcurpm;
4907 }
4908}
33b8afdf
JH
4909
4910STATIC void
4911S_to_utf8_substr(pTHX_ register regexp *prog)
4912{
4913 SV* sv;
4914 if (prog->float_substr && !prog->float_utf8) {
f2b990bf 4915 prog->float_utf8 = sv = newSVsv(prog->float_substr);
33b8afdf
JH
4916 sv_utf8_upgrade(sv);
4917 if (SvTAIL(prog->float_substr))
4918 SvTAIL_on(sv);
4919 if (prog->float_substr == prog->check_substr)
4920 prog->check_utf8 = sv;
4921 }
4922 if (prog->anchored_substr && !prog->anchored_utf8) {
f2b990bf 4923 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
33b8afdf
JH
4924 sv_utf8_upgrade(sv);
4925 if (SvTAIL(prog->anchored_substr))
4926 SvTAIL_on(sv);
4927 if (prog->anchored_substr == prog->check_substr)
4928 prog->check_utf8 = sv;
4929 }
4930}
4931
4932STATIC void
4933S_to_byte_substr(pTHX_ register regexp *prog)
4934{
4935 SV* sv;
4936 if (prog->float_utf8 && !prog->float_substr) {
f2b990bf 4937 prog->float_substr = sv = newSVsv(prog->float_utf8);
33b8afdf
JH
4938 if (sv_utf8_downgrade(sv, TRUE)) {
4939 if (SvTAIL(prog->float_utf8))
4940 SvTAIL_on(sv);
4941 } else {
4942 SvREFCNT_dec(sv);
4943 prog->float_substr = sv = &PL_sv_undef;
4944 }
4945 if (prog->float_utf8 == prog->check_utf8)
4946 prog->check_substr = sv;
4947 }
4948 if (prog->anchored_utf8 && !prog->anchored_substr) {
f2b990bf 4949 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
33b8afdf
JH
4950 if (sv_utf8_downgrade(sv, TRUE)) {
4951 if (SvTAIL(prog->anchored_utf8))
4952 SvTAIL_on(sv);
4953 } else {
4954 SvREFCNT_dec(sv);
4955 prog->anchored_substr = sv = &PL_sv_undef;
4956 }
4957 if (prog->anchored_utf8 == prog->check_utf8)
4958 prog->check_substr = sv;
4959 }
4960}