This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix -C option of mktables (for VMS)
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16
166f8a29
DM
17 */
18
a687059c
LW
19/* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
21 */
22
23/* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
26 */
27
e50aee73
AD
28/* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
31*/
32
b9d5759e
AD
33#ifdef PERL_EXT_RE_BUILD
34/* need to replace pregcomp et al, so enable that */
35# ifndef PERL_IN_XSUB_RE
36# define PERL_IN_XSUB_RE
37# endif
38/* need access to debugger hooks */
cad2e5aa 39# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
40# define DEBUGGING
41# endif
42#endif
43
44#ifdef PERL_IN_XSUB_RE
d06ea78c 45/* We *really* need to overwrite these symbols: */
56953603
IZ
46# define Perl_regexec_flags my_regexec
47# define Perl_regdump my_regdump
48# define Perl_regprop my_regprop
cad2e5aa 49# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
50/* *These* symbols are masked to allow static link. */
51# define Perl_pregexec my_pregexec
9041c2e3 52# define Perl_reginitcolors my_reginitcolors
490a3f88 53# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
9041c2e3 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
f2b990bf 82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
a687059c
LW
86 *
87 * Beware that some of this code is subtly aware of the way operator
88 * precedence is structured in regular expressions. Serious changes in
89 * regular-expression syntax might require a total rethink.
90 */
91#include "EXTERN.h"
864dbfa3 92#define PERL_IN_REGEXEC_C
a687059c 93#include "perl.h"
0f5d15d6 94
a687059c
LW
95#include "regcomp.h"
96
c277df42
IZ
97#define RF_tainted 1 /* tainted information used? */
98#define RF_warned 2 /* warned about big count? */
ce862d02 99#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3 100#define RF_utf8 8 /* String contains multibyte chars? */
69a95ace 101#define RF_false 16 /* odd number of nested negatives */
a0ed51b3 102
eb160463 103#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
104
105#define RS_init 1 /* eval environment created */
106#define RS_set 2 /* replsv value is set */
c277df42 107
a687059c
LW
108#ifndef STATIC
109#define STATIC static
110#endif
111
ba7b4546 112#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 113
c277df42
IZ
114/*
115 * Forwards.
116 */
117
33b8afdf 118#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 119#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 120
dfe13c55
GS
121#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
122#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
123#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
124#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
125#define HOPc(pos,off) ((char*)HOP(pos,off))
126#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 127
efb30f32 128#define HOPBACK(pos, off) ( \
e54858b0 129 (PL_reg_match_utf8) \
efb30f32
HS
130 ? reghopmaybe((U8*)pos, -off) \
131 : (pos - off >= PL_bostr) \
132 ? (U8*)(pos - off) \
133 : (U8*)NULL \
134)
135#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
136
1aa99e6b
IH
137#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
138#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
139#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
140#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
141#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
142#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
143
bfed75c6 144#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((const U8*)b); LEAVE; } } STMT_END
51371543 145
5f80c4cf 146/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
147#define JUMPABLE(rn) ( \
148 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
149 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
150 OP(rn) == PLUS || OP(rn) == MINMOD || \
151 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
152)
153
cca55fe3
JP
154#define HAS_TEXT(rn) ( \
155 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
156)
e2d8ce26 157
a84d97b6
HS
158/*
159 Search for mandatory following text node; for lookahead, the text must
160 follow but for lookbehind (rn->flags != 0) we skip to the next step.
161*/
cca55fe3 162#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 163 while (JUMPABLE(rn)) \
a84d97b6 164 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 165 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
166 else if (OP(rn) == PLUS) \
167 rn = NEXTOPER(rn); \
a84d97b6
HS
168 else if (OP(rn) == IFMATCH) \
169 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 170 else rn += NEXT_OFF(rn); \
5f80c4cf 171} STMT_END
74750237 172
acfe0abc 173static void restore_pos(pTHX_ void *arg);
51371543 174
76e3520e 175STATIC CHECKPOINT
cea2e8a9 176S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 177{
3280af22 178 int retval = PL_savestack_ix;
b1ce53c5
JH
179#define REGCP_PAREN_ELEMS 4
180 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
181 int p;
182
e49a9654
IH
183 if (paren_elems_to_push < 0)
184 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
185
a01268b5 186#define REGCP_OTHER_ELEMS 6
4b3c1a47 187 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 188 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 189/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
190 SSPUSHINT(PL_regendp[p]);
191 SSPUSHINT(PL_regstartp[p]);
3280af22 192 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
193 SSPUSHINT(p);
194 }
b1ce53c5 195/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
196 SSPUSHINT(PL_regsize);
197 SSPUSHINT(*PL_reglastparen);
a01268b5 198 SSPUSHINT(*PL_reglastcloseparen);
3280af22 199 SSPUSHPTR(PL_reginput);
41123dfd
JH
200#define REGCP_FRAME_ELEMS 2
201/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
202 * are needed for the regexp context stack bookkeeping. */
203 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 204 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 205
a0d0e21e
LW
206 return retval;
207}
208
c277df42 209/* These are needed since we do not localize EVAL nodes: */
a3621e74 210# define REGCP_SET(cp) DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, \
faccc32b 211 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 212 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 213
a3621e74 214# define REGCP_UNWIND(cp) DEBUG_EXECUTE_r(cp != PL_savestack_ix ? \
c3464db5 215 PerlIO_printf(Perl_debug_log, \
faccc32b 216 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 217 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 218
76e3520e 219STATIC char *
cea2e8a9 220S_regcppop(pTHX)
a0d0e21e 221{
b1ce53c5 222 I32 i;
a0d0e21e
LW
223 U32 paren = 0;
224 char *input;
cf93c79d 225 I32 tmps;
b1ce53c5 226
a3621e74
YO
227 GET_RE_DEBUG_FLAGS_DECL;
228
b1ce53c5 229 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 230 i = SSPOPINT;
b1ce53c5
JH
231 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
232 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 233 input = (char *) SSPOPPTR;
a01268b5 234 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
235 *PL_reglastparen = SSPOPINT;
236 PL_regsize = SSPOPINT;
b1ce53c5
JH
237
238 /* Now restore the parentheses context. */
41123dfd
JH
239 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
240 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 241 paren = (U32)SSPOPINT;
3280af22 242 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
243 PL_regstartp[paren] = SSPOPINT;
244 tmps = SSPOPINT;
3280af22
NIS
245 if (paren <= *PL_reglastparen)
246 PL_regendp[paren] = tmps;
a3621e74 247 DEBUG_EXECUTE_r(
c3464db5 248 PerlIO_printf(Perl_debug_log,
b900a521 249 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 250 (UV)paren, (IV)PL_regstartp[paren],
b900a521 251 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 252 (IV)PL_regendp[paren],
3280af22 253 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 254 );
a0d0e21e 255 }
a3621e74 256 DEBUG_EXECUTE_r(
eb160463 257 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 258 PerlIO_printf(Perl_debug_log,
faccc32b
JH
259 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
260 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
261 }
262 );
daf18116 263#if 1
dafc8851
JH
264 /* It would seem that the similar code in regtry()
265 * already takes care of this, and in fact it is in
266 * a better location to since this code can #if 0-ed out
267 * but the code in regtry() is needed or otherwise tests
268 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
269 * (as of patchlevel 7877) will fail. Then again,
270 * this code seems to be necessary or otherwise
271 * building DynaLoader will fail:
272 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
273 * --jhi */
eb160463
GS
274 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
275 if ((I32)paren > PL_regsize)
cf93c79d
IZ
276 PL_regstartp[paren] = -1;
277 PL_regendp[paren] = -1;
a0d0e21e 278 }
dafc8851 279#endif
a0d0e21e
LW
280 return input;
281}
282
0f5d15d6 283STATIC char *
cea2e8a9 284S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
285{
286 I32 tmp = PL_savestack_ix;
287
288 PL_savestack_ix = ss;
289 regcppop();
290 PL_savestack_ix = tmp;
942e002e 291 return Nullch;
0f5d15d6
IZ
292}
293
294typedef struct re_cc_state
295{
296 I32 ss;
297 regnode *node;
298 struct re_cc_state *prev;
299 CURCUR *cc;
300 regexp *re;
301} re_cc_state;
302
02db2b7b 303#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 304
29d1e993
HS
305#define TRYPAREN(paren, n, input) { \
306 if (paren) { \
307 if (n) { \
308 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
309 PL_regendp[paren] = input - PL_bostr; \
310 } \
311 else \
312 PL_regendp[paren] = -1; \
313 } \
314 if (regmatch(next)) \
315 sayYES; \
316 if (paren && n) \
317 PL_regendp[paren] = -1; \
318}
319
320
a687059c 321/*
e50aee73 322 * pregexec and friends
a687059c
LW
323 */
324
325/*
c277df42 326 - pregexec - match a regexp against a string
a687059c 327 */
c277df42 328I32
864dbfa3 329Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 330 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
331/* strend: pointer to null at end of string */
332/* strbeg: real beginning of string */
333/* minend: end of match must be >=minend after stringarg. */
334/* nosave: For optimizations. */
335{
336 return
9041c2e3 337 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
338 nosave ? 0 : REXEC_COPY_STR);
339}
0f5d15d6
IZ
340
341STATIC void
cea2e8a9 342S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
343{
344 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
345#ifdef DEBUGGING
346 PL_regprogram = prog->program;
347#endif
348 PL_regnpar = prog->nparens;
9041c2e3
NIS
349 PL_regdata = prog->data;
350 PL_reg_re = prog;
0f5d15d6 351}
22e551b9 352
9041c2e3 353/*
cad2e5aa
JH
354 * Need to implement the following flags for reg_anch:
355 *
356 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
357 * USE_INTUIT_ML
358 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
359 * INTUIT_AUTORITATIVE_ML
360 * INTUIT_ONCE_NOML - Intuit can match in one location only.
361 * INTUIT_ONCE_ML
362 *
363 * Another flag for this function: SECOND_TIME (so that float substrs
364 * with giant delta may be not rechecked).
365 */
366
367/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
368
2c2d71f5 369/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
370 Otherwise, only SvCUR(sv) is used to get strbeg. */
371
372/* XXXX We assume that strpos is strbeg unless sv. */
373
6eb5f6b9
JH
374/* XXXX Some places assume that there is a fixed substring.
375 An update may be needed if optimizer marks as "INTUITable"
376 RExen without fixed substrings. Similarly, it is assumed that
377 lengths of all the strings are no more than minlen, thus they
378 cannot come from lookahead.
379 (Or minlen should take into account lookahead.) */
380
2c2d71f5
JH
381/* A failure to find a constant substring means that there is no need to make
382 an expensive call to REx engine, thus we celebrate a failure. Similarly,
383 finding a substring too deep into the string means that less calls to
30944b6d
IZ
384 regtry() should be needed.
385
386 REx compiler's optimizer found 4 possible hints:
387 a) Anchored substring;
388 b) Fixed substring;
389 c) Whether we are anchored (beginning-of-line or \G);
390 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 391 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
392 string which does not contradict any of them.
393 */
2c2d71f5 394
6eb5f6b9
JH
395/* Most of decisions we do here should have been done at compile time.
396 The nodes of the REx which we used for the search should have been
397 deleted from the finite automaton. */
398
cad2e5aa
JH
399char *
400Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
401 char *strend, U32 flags, re_scream_pos_data *data)
402{
b7953727 403 register I32 start_shift = 0;
cad2e5aa 404 /* Should be nonnegative! */
b7953727 405 register I32 end_shift = 0;
2c2d71f5
JH
406 register char *s;
407 register SV *check;
a1933d95 408 char *strbeg;
cad2e5aa 409 char *t;
33b8afdf 410 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 411 I32 ml_anch;
6eb5f6b9 412 register char *other_last = Nullch; /* other substr checked before this */
b7953727 413 char *check_at = Nullch; /* check substr found at this pos */
7fba1cd6 414 I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d
IZ
415#ifdef DEBUGGING
416 char *i_strpos = strpos;
ce333219 417 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 418#endif
a3621e74
YO
419
420 GET_RE_DEBUG_FLAGS_DECL;
421
a30b2f1f 422 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 423
b8d68ded 424 if (prog->reganch & ROPT_UTF8) {
a3621e74 425 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded
JH
426 "UTF-8 regex...\n"));
427 PL_reg_flags |= RF_utf8;
428 }
429
a3621e74 430 DEBUG_EXECUTE_r({
b8d68ded 431 char *s = PL_reg_match_utf8 ?
c728cb41
JH
432 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
433 strpos;
b8d68ded
JH
434 int len = PL_reg_match_utf8 ?
435 strlen(s) : strend - strpos;
2a782b5b
JH
436 if (!PL_colorset)
437 reginitcolors();
b8d68ded 438 if (PL_reg_match_utf8)
a3621e74 439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b8d68ded 440 "UTF-8 target...\n"));
2a782b5b
JH
441 PerlIO_printf(Perl_debug_log,
442 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
443 PL_colors[4],PL_colors[5],PL_colors[0],
444 prog->precomp,
445 PL_colors[1],
446 (strlen(prog->precomp) > 60 ? "..." : ""),
447 PL_colors[0],
448 (int)(len > 60 ? 60 : len),
449 s, PL_colors[1],
450 (len > 60 ? "..." : "")
451 );
452 });
cad2e5aa 453
c344f387
JH
454 /* CHR_DIST() would be more correct here but it makes things slow. */
455 if (prog->minlen > strend - strpos) {
a3621e74 456 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 457 "String too short... [re_intuit_start]\n"));
cad2e5aa 458 goto fail;
2c2d71f5 459 }
a1933d95 460 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 461 PL_regeol = strend;
33b8afdf
JH
462 if (do_utf8) {
463 if (!prog->check_utf8 && prog->check_substr)
464 to_utf8_substr(prog);
465 check = prog->check_utf8;
466 } else {
467 if (!prog->check_substr && prog->check_utf8)
468 to_byte_substr(prog);
469 check = prog->check_substr;
470 }
471 if (check == &PL_sv_undef) {
a3621e74 472 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
473 "Non-utf string cannot match utf check string\n"));
474 goto fail;
475 }
2c2d71f5 476 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
477 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
478 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 479 && !multiline ) ); /* Check after \n? */
cad2e5aa 480
7e25d62c
JH
481 if (!ml_anch) {
482 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
483 | ROPT_IMPLICIT)) /* not a real BOL */
484 /* SvCUR is not set on references: SvRV and SvPVX overlap */
485 && sv && !SvROK(sv)
486 && (strpos != strbeg)) {
a3621e74 487 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
488 goto fail;
489 }
490 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 491 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 492 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
493 I32 slen;
494
1aa99e6b 495 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
496 if (SvTAIL(check)) {
497 slen = SvCUR(check); /* >= 1 */
cad2e5aa 498
9041c2e3 499 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 500 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 501 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 502 goto fail_finish;
cad2e5aa
JH
503 }
504 /* Now should match s[0..slen-2] */
505 slen--;
653099ff 506 if (slen && (*SvPVX(check) != *s
cad2e5aa 507 || (slen > 1
653099ff 508 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5 509 report_neq:
a3621e74 510 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
511 goto fail_finish;
512 }
cad2e5aa 513 }
653099ff
GS
514 else if (*SvPVX(check) != *s
515 || ((slen = SvCUR(check)) > 1
516 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
517 goto report_neq;
518 goto success_at_start;
7e25d62c 519 }
cad2e5aa 520 }
2c2d71f5 521 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 522 s = strpos;
2c2d71f5 523 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 524 end_shift = prog->minlen - start_shift -
653099ff 525 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 526 if (!ml_anch) {
653099ff
GS
527 I32 end = prog->check_offset_max + CHR_SVLEN(check)
528 - (SvTAIL(check) != 0);
1aa99e6b 529 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
530
531 if (end_shift < eshift)
532 end_shift = eshift;
533 }
cad2e5aa 534 }
2c2d71f5 535 else { /* Can match at random position */
cad2e5aa
JH
536 ml_anch = 0;
537 s = strpos;
2c2d71f5
JH
538 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
539 /* Should be nonnegative! */
540 end_shift = prog->minlen - start_shift -
653099ff 541 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
542 }
543
2c2d71f5 544#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 545 if (end_shift < 0)
6bbae5e6 546 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
547#endif
548
2c2d71f5
JH
549 restart:
550 /* Find a possible match in the region s..strend by looking for
551 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 552 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
553 I32 p = -1; /* Internal iterator of scream. */
554 I32 *pp = data ? data->scream_pos : &p;
555
2c2d71f5
JH
556 if (PL_screamfirst[BmRARE(check)] >= 0
557 || ( BmRARE(check) == '\n'
558 && (BmPREVIOUS(check) == SvCUR(check) - 1)
559 && SvTAIL(check) ))
9041c2e3 560 s = screaminstr(sv, check,
2c2d71f5 561 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 562 else
2c2d71f5 563 goto fail_finish;
4addbd3b
HS
564 /* we may be pointing at the wrong string */
565 if (s && RX_MATCH_COPIED(prog))
7ef91622 566 s = strbeg + (s - SvPVX(sv));
cad2e5aa
JH
567 if (data)
568 *data->scream_olds = s;
569 }
f33976b4 570 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
571 s = fbm_instr((U8*)(s + start_shift),
572 (U8*)(strend - end_shift),
7fba1cd6 573 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 574 else
1aa99e6b
IH
575 s = fbm_instr(HOP3(s, start_shift, strend),
576 HOP3(strend, -end_shift, strbeg),
7fba1cd6 577 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
578
579 /* Update the count-of-usability, remove useless subpatterns,
580 unshift s. */
2c2d71f5 581
a3621e74 582 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
2c2d71f5 583 (s ? "Found" : "Did not find"),
33b8afdf 584 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 585 PL_colors[0],
7b0972df
JH
586 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
587 SvPVX(check),
2c2d71f5
JH
588 PL_colors[1], (SvTAIL(check) ? "$" : ""),
589 (s ? " at offset " : "...\n") ) );
590
591 if (!s)
592 goto fail_finish;
593
6eb5f6b9
JH
594 check_at = s;
595
2c2d71f5 596 /* Finish the diagnostic message */
a3621e74 597 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
598
599 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
600 Start with the other substr.
601 XXXX no SCREAM optimization yet - and a very coarse implementation
602 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
603 *always* match. Probably should be marked during compile...
604 Probably it is right to do no SCREAM here...
605 */
606
33b8afdf 607 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 608 /* Take into account the "other" substring. */
2c2d71f5
JH
609 /* XXXX May be hopelessly wrong for UTF... */
610 if (!other_last)
6eb5f6b9 611 other_last = strpos;
33b8afdf 612 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
613 do_other_anchored:
614 {
1aa99e6b 615 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 616 char *s1 = s;
33b8afdf 617 SV* must;
2c2d71f5 618
2c2d71f5
JH
619 t = s - prog->check_offset_max;
620 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 621 && (!do_utf8
1aa99e6b 622 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 623 && t > strpos)))
30944b6d 624 /* EMPTY */;
2c2d71f5
JH
625 else
626 t = strpos;
1aa99e6b 627 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
628 if (t < other_last) /* These positions already checked */
629 t = other_last;
1aa99e6b 630 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
631 if (last < last1)
632 last1 = last;
633 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
634 /* On end-of-str: see comment below. */
33b8afdf
JH
635 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
636 if (must == &PL_sv_undef) {
637 s = (char*)NULL;
a3621e74 638 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
639 }
640 else
641 s = fbm_instr(
642 (unsigned char*)t,
643 HOP3(HOP3(last1, prog->anchored_offset, strend)
644 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
645 must,
7fba1cd6 646 multiline ? FBMrf_MULTILINE : 0
33b8afdf 647 );
a3621e74 648 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1aa99e6b 649 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
650 (s ? "Found" : "Contradicts"),
651 PL_colors[0],
33b8afdf
JH
652 (int)(SvCUR(must)
653 - (SvTAIL(must)!=0)),
654 SvPVX(must),
655 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
656 if (!s) {
657 if (last1 >= last2) {
a3621e74 658 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
659 ", giving up...\n"));
660 goto fail_finish;
661 }
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 663 ", trying floating at offset %ld...\n",
1aa99e6b
IH
664 (long)(HOP3c(s1, 1, strend) - i_strpos)));
665 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
666 s = HOP3c(last, 1, strend);
2c2d71f5
JH
667 goto restart;
668 }
669 else {
a3621e74 670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 671 (long)(s - i_strpos)));
1aa99e6b
IH
672 t = HOP3c(s, -prog->anchored_offset, strbeg);
673 other_last = HOP3c(s, 1, strend);
30944b6d 674 s = s1;
2c2d71f5
JH
675 if (t == strpos)
676 goto try_at_start;
2c2d71f5
JH
677 goto try_at_offset;
678 }
30944b6d 679 }
2c2d71f5
JH
680 }
681 else { /* Take into account the floating substring. */
33b8afdf
JH
682 char *last, *last1;
683 char *s1 = s;
684 SV* must;
685
686 t = HOP3c(s, -start_shift, strbeg);
687 last1 = last =
688 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
689 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
690 last = HOP3c(t, prog->float_max_offset, strend);
691 s = HOP3c(t, prog->float_min_offset, strend);
692 if (s < other_last)
693 s = other_last;
2c2d71f5 694 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
695 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
696 /* fbm_instr() takes into account exact value of end-of-str
697 if the check is SvTAIL(ed). Since false positives are OK,
698 and end-of-str is not later than strend we are OK. */
699 if (must == &PL_sv_undef) {
700 s = (char*)NULL;
a3621e74 701 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
702 }
703 else
2c2d71f5 704 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
705 (unsigned char*)last + SvCUR(must)
706 - (SvTAIL(must)!=0),
7fba1cd6 707 must, multiline ? FBMrf_MULTILINE : 0);
a3621e74 708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
33b8afdf
JH
709 (s ? "Found" : "Contradicts"),
710 PL_colors[0],
711 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
712 SvPVX(must),
713 PL_colors[1], (SvTAIL(must) ? "$" : "")));
714 if (!s) {
715 if (last1 == last) {
a3621e74 716 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
717 ", giving up...\n"));
718 goto fail_finish;
2c2d71f5 719 }
a3621e74 720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
721 ", trying anchored starting at offset %ld...\n",
722 (long)(s1 + 1 - i_strpos)));
723 other_last = last;
724 s = HOP3c(t, 1, strend);
725 goto restart;
726 }
727 else {
a3621e74 728 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
729 (long)(s - i_strpos)));
730 other_last = s; /* Fix this later. --Hugo */
731 s = s1;
732 if (t == strpos)
733 goto try_at_start;
734 goto try_at_offset;
735 }
2c2d71f5 736 }
cad2e5aa 737 }
2c2d71f5
JH
738
739 t = s - prog->check_offset_max;
2c2d71f5 740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 741 && (!do_utf8
1aa99e6b
IH
742 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
743 && t > strpos))) {
2c2d71f5
JH
744 /* Fixed substring is found far enough so that the match
745 cannot start at strpos. */
746 try_at_offset:
cad2e5aa 747 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
748 /* Eventually fbm_*() should handle this, but often
749 anchored_offset is not 0, so this check will not be wasted. */
750 /* XXXX In the code below we prefer to look for "^" even in
751 presence of anchored substrings. And we search even
752 beyond the found float position. These pessimizations
753 are historical artefacts only. */
754 find_anchor:
2c2d71f5 755 while (t < strend - prog->minlen) {
cad2e5aa 756 if (*t == '\n') {
4ee3650e 757 if (t < check_at - prog->check_offset_min) {
33b8afdf 758 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
759 /* Since we moved from the found position,
760 we definitely contradict the found anchored
30944b6d
IZ
761 substr. Due to the above check we do not
762 contradict "check" substr.
763 Thus we can arrive here only if check substr
764 is float. Redo checking for "other"=="fixed".
765 */
9041c2e3 766 strpos = t + 1;
a3621e74 767 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
30944b6d
IZ
768 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
769 goto do_other_anchored;
770 }
4ee3650e
GS
771 /* We don't contradict the found floating substring. */
772 /* XXXX Why not check for STCLASS? */
cad2e5aa 773 s = t + 1;
a3621e74 774 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 775 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
776 goto set_useful;
777 }
4ee3650e
GS
778 /* Position contradicts check-string */
779 /* XXXX probably better to look for check-string
780 than for "\n", so one should lower the limit for t? */
a3621e74 781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 782 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 783 other_last = strpos = s = t + 1;
cad2e5aa
JH
784 goto restart;
785 }
786 t++;
787 }
a3621e74 788 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
2c2d71f5
JH
789 PL_colors[0],PL_colors[1]));
790 goto fail_finish;
cad2e5aa 791 }
f5952150 792 else {
a3621e74 793 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
f5952150
GS
794 PL_colors[0],PL_colors[1]));
795 }
cad2e5aa
JH
796 s = t;
797 set_useful:
33b8afdf 798 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
799 }
800 else {
f5952150 801 /* The found string does not prohibit matching at strpos,
2c2d71f5 802 - no optimization of calling REx engine can be performed,
f5952150
GS
803 unless it was an MBOL and we are not after MBOL,
804 or a future STCLASS check will fail this. */
2c2d71f5
JH
805 try_at_start:
806 /* Even in this situation we may use MBOL flag if strpos is offset
807 wrt the start of the string. */
05b4157f 808 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 809 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
810 /* May be due to an implicit anchor of m{.*foo} */
811 && !(prog->reganch & ROPT_IMPLICIT))
812 {
cad2e5aa
JH
813 t = strpos;
814 goto find_anchor;
815 }
a3621e74 816 DEBUG_EXECUTE_r( if (ml_anch)
f5952150
GS
817 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
818 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 819 );
2c2d71f5 820 success_at_start:
30944b6d 821 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
822 && (do_utf8 ? (
823 prog->check_utf8 /* Could be deleted already */
824 && --BmUSEFUL(prog->check_utf8) < 0
825 && (prog->check_utf8 == prog->float_utf8)
826 ) : (
827 prog->check_substr /* Could be deleted already */
828 && --BmUSEFUL(prog->check_substr) < 0
829 && (prog->check_substr == prog->float_substr)
830 )))
66e933ab 831 {
cad2e5aa 832 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 833 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
834 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
835 if (do_utf8 ? prog->check_substr : prog->check_utf8)
836 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
837 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
838 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 839 check = Nullsv; /* abort */
cad2e5aa 840 s = strpos;
3cf5c195
IZ
841 /* XXXX This is a remnant of the old implementation. It
842 looks wasteful, since now INTUIT can use many
6eb5f6b9 843 other heuristics. */
cad2e5aa
JH
844 prog->reganch &= ~RE_USE_INTUIT;
845 }
846 else
847 s = strpos;
848 }
849
6eb5f6b9
JH
850 /* Last resort... */
851 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
852 if (prog->regstclass) {
853 /* minlen == 0 is possible if regstclass is \b or \B,
854 and the fixed substr is ''$.
855 Since minlen is already taken into account, s+1 is before strend;
856 accidentally, minlen >= 1 guaranties no false positives at s + 1
857 even for \b or \B. But (minlen? 1 : 0) below assumes that
858 regstclass does not come from lookahead... */
859 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
860 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 861 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 862 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 863 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 864 : 1);
33b8afdf 865 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 866 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 867 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
868 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
869 cl_l, strend)
870 : strend);
a1933d95 871 char *startpos = strbeg;
6eb5f6b9
JH
872
873 t = s;
9926ca43 874 cache_re(prog);
f33976b4 875 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
876 if (!s) {
877#ifdef DEBUGGING
e1ec3a88 878 const char *what = 0;
6eb5f6b9
JH
879#endif
880 if (endpos == strend) {
a3621e74 881 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
882 "Could not match STCLASS...\n") );
883 goto fail;
884 }
a3621e74 885 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 886 "This position contradicts STCLASS...\n") );
653099ff
GS
887 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
888 goto fail;
6eb5f6b9 889 /* Contradict one of substrings */
33b8afdf
JH
890 if (prog->anchored_substr || prog->anchored_utf8) {
891 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 892 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 893 hop_and_restart:
1aa99e6b 894 s = HOP3c(t, 1, strend);
66e933ab
GS
895 if (s + start_shift + end_shift > strend) {
896 /* XXXX Should be taken into account earlier? */
a3621e74 897 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
898 "Could not match STCLASS...\n") );
899 goto fail;
900 }
5e39e1e5
HS
901 if (!check)
902 goto giveup;
a3621e74 903 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 904 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
905 what, (long)(s + start_shift - i_strpos)) );
906 goto restart;
907 }
66e933ab 908 /* Have both, check_string is floating */
6eb5f6b9
JH
909 if (t + start_shift >= check_at) /* Contradicts floating=check */
910 goto retry_floating_check;
911 /* Recheck anchored substring, but not floating... */
9041c2e3 912 s = check_at;
5e39e1e5
HS
913 if (!check)
914 goto giveup;
a3621e74 915 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 916 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
917 (long)(other_last - i_strpos)) );
918 goto do_other_anchored;
919 }
60e71179
GS
920 /* Another way we could have checked stclass at the
921 current position only: */
922 if (ml_anch) {
923 s = t = t + 1;
5e39e1e5
HS
924 if (!check)
925 goto giveup;
a3621e74 926 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
927 "Looking for /%s^%s/m starting at offset %ld...\n",
928 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 929 goto try_at_offset;
66e933ab 930 }
33b8afdf 931 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 932 goto fail;
6eb5f6b9
JH
933 /* Check is floating subtring. */
934 retry_floating_check:
935 t = check_at - start_shift;
a3621e74 936 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
937 goto hop_and_restart;
938 }
b7953727 939 if (t != s) {
a3621e74 940 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 941 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
942 (long)(t - i_strpos), (long)(s - i_strpos))
943 );
944 }
945 else {
a3621e74 946 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
947 "Does not contradict STCLASS...\n");
948 );
949 }
6eb5f6b9 950 }
5e39e1e5 951 giveup:
a3621e74 952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
953 PL_colors[4], (check ? "Guessed" : "Giving up"),
954 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 955 return s;
2c2d71f5
JH
956
957 fail_finish: /* Substring not found */
33b8afdf
JH
958 if (prog->check_substr || prog->check_utf8) /* could be removed already */
959 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 960 fail:
a3621e74 961 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
962 PL_colors[4],PL_colors[5]));
963 return Nullch;
964}
9661b544 965
6eb5f6b9 966/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
967STATIC char *
968S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 969{
6eb5f6b9
JH
970 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
971 char *m;
d8093b23 972 STRLEN ln;
5dab1207 973 STRLEN lnc;
078c425b 974 register STRLEN uskip;
d8093b23
G
975 unsigned int c1;
976 unsigned int c2;
6eb5f6b9
JH
977 char *e;
978 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 979 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 980
6eb5f6b9
JH
981 /* We know what class it must start with. */
982 switch (OP(c)) {
6eb5f6b9 983 case ANYOF:
388cc4de 984 if (do_utf8) {
078c425b 985 while (s + (uskip = UTF8SKIP(s)) <= strend) {
388cc4de
HS
986 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
987 !UTF8_IS_INVARIANT((U8)s[0]) ?
988 reginclass(c, (U8*)s, 0, do_utf8) :
989 REGINCLASS(c, (U8*)s)) {
990 if (tmp && (norun || regtry(prog, s)))
991 goto got_it;
992 else
993 tmp = doevery;
994 }
995 else
996 tmp = 1;
078c425b 997 s += uskip;
388cc4de
HS
998 }
999 }
1000 else {
1001 while (s < strend) {
1002 STRLEN skip = 1;
1003
1004 if (REGINCLASS(c, (U8*)s) ||
1005 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1006 /* The assignment of 2 is intentional:
1007 * for the folded sharp s, the skip is 2. */
1008 (skip = SHARP_S_SKIP))) {
1009 if (tmp && (norun || regtry(prog, s)))
1010 goto got_it;
1011 else
1012 tmp = doevery;
1013 }
1014 else
1015 tmp = 1;
1016 s += skip;
1017 }
a0d0e21e 1018 }
6eb5f6b9 1019 break;
f33976b4
DB
1020 case CANY:
1021 while (s < strend) {
1022 if (tmp && (norun || regtry(prog, s)))
1023 goto got_it;
1024 else
1025 tmp = doevery;
1026 s++;
1027 }
1028 break;
6eb5f6b9 1029 case EXACTF:
5dab1207
NIS
1030 m = STRING(c);
1031 ln = STR_LEN(c); /* length to match in octets/bytes */
1032 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1033 if (UTF) {
a2a2844f 1034 STRLEN ulen1, ulen2;
5dab1207 1035 U8 *sm = (U8 *) m;
89ebb4a3
JH
1036 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1037 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a2a2844f
JH
1038
1039 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1040 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1041
89ebb4a3 1042 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
872c91ae 1043 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
89ebb4a3 1044 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
872c91ae 1045 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
5dab1207
NIS
1046 lnc = 0;
1047 while (sm < ((U8 *) m + ln)) {
1048 lnc++;
1049 sm += UTF8SKIP(sm);
1050 }
1aa99e6b
IH
1051 }
1052 else {
1053 c1 = *(U8*)m;
1054 c2 = PL_fold[c1];
1055 }
6eb5f6b9
JH
1056 goto do_exactf;
1057 case EXACTFL:
5dab1207
NIS
1058 m = STRING(c);
1059 ln = STR_LEN(c);
1060 lnc = (I32) ln;
d8093b23 1061 c1 = *(U8*)m;
6eb5f6b9
JH
1062 c2 = PL_fold_locale[c1];
1063 do_exactf:
db12adc6 1064 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1065
6eb5f6b9
JH
1066 if (norun && e < s)
1067 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1068
60a8b682
JH
1069 /* The idea in the EXACTF* cases is to first find the
1070 * first character of the EXACTF* node and then, if
1071 * necessary, case-insensitively compare the full
1072 * text of the node. The c1 and c2 are the first
1073 * characters (though in Unicode it gets a bit
1074 * more complicated because there are more cases
7f16dd3d
JH
1075 * than just upper and lower: one needs to use
1076 * the so-called folding case for case-insensitive
1077 * matching (called "loose matching" in Unicode).
1078 * ibcmp_utf8() will do just that. */
60a8b682 1079
1aa99e6b 1080 if (do_utf8) {
575cac57 1081 UV c, f;
89ebb4a3
JH
1082 U8 tmpbuf [UTF8_MAXBYTES+1];
1083 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
575cac57 1084 STRLEN len, foldlen;
d7f013c8 1085
09091399 1086 if (c1 == c2) {
5dab1207
NIS
1087 /* Upper and lower of 1st char are equal -
1088 * probably not a "letter". */
1aa99e6b 1089 while (s <= e) {
89ebb4a3 1090 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1091 ckWARN(WARN_UTF8) ?
1092 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1093 if ( c == c1
1094 && (ln == len ||
66423254 1095 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1096 m, (char **)0, ln, (bool)UTF))
55da9344 1097 && (norun || regtry(prog, s)) )
1aa99e6b 1098 goto got_it;
80aecb99
JH
1099 else {
1100 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1101 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1102 if ( f != c
1103 && (f == c1 || f == c2)
1104 && (ln == foldlen ||
66423254
JH
1105 !ibcmp_utf8((char *) foldbuf,
1106 (char **)0, foldlen, do_utf8,
d07ddd77 1107 m,
eb160463 1108 (char **)0, ln, (bool)UTF))
80aecb99
JH
1109 && (norun || regtry(prog, s)) )
1110 goto got_it;
1111 }
1aa99e6b
IH
1112 s += len;
1113 }
09091399
JH
1114 }
1115 else {
1aa99e6b 1116 while (s <= e) {
89ebb4a3 1117 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
872c91ae
JH
1118 ckWARN(WARN_UTF8) ?
1119 0 : UTF8_ALLOW_ANY);
80aecb99 1120
60a8b682 1121 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1122 * Note that not all the possible combinations
1123 * are handled here: some of them are handled
1124 * by the standard folding rules, and some of
1125 * them (the character class or ANYOF cases)
1126 * are handled during compiletime in
1127 * regexec.c:S_regclass(). */
880bd946
JH
1128 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1129 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1130 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1131
1132 if ( (c == c1 || c == c2)
1133 && (ln == len ||
66423254 1134 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1135 m, (char **)0, ln, (bool)UTF))
55da9344 1136 && (norun || regtry(prog, s)) )
1aa99e6b 1137 goto got_it;
80aecb99
JH
1138 else {
1139 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1140 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1141 if ( f != c
1142 && (f == c1 || f == c2)
1143 && (ln == foldlen ||
a6872d42 1144 !ibcmp_utf8((char *) foldbuf,
66423254 1145 (char **)0, foldlen, do_utf8,
d07ddd77 1146 m,
eb160463 1147 (char **)0, ln, (bool)UTF))
80aecb99
JH
1148 && (norun || regtry(prog, s)) )
1149 goto got_it;
1150 }
1aa99e6b
IH
1151 s += len;
1152 }
09091399 1153 }
1aa99e6b
IH
1154 }
1155 else {
1156 if (c1 == c2)
1157 while (s <= e) {
1158 if ( *(U8*)s == c1
1159 && (ln == 1 || !(OP(c) == EXACTF
1160 ? ibcmp(s, m, ln)
1161 : ibcmp_locale(s, m, ln)))
1162 && (norun || regtry(prog, s)) )
1163 goto got_it;
1164 s++;
1165 }
1166 else
1167 while (s <= e) {
1168 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1169 && (ln == 1 || !(OP(c) == EXACTF
1170 ? ibcmp(s, m, ln)
1171 : ibcmp_locale(s, m, ln)))
1172 && (norun || regtry(prog, s)) )
1173 goto got_it;
1174 s++;
1175 }
b3c9acc1
IZ
1176 }
1177 break;
bbce6d69 1178 case BOUNDL:
3280af22 1179 PL_reg_flags |= RF_tainted;
bbce6d69 1180 /* FALL THROUGH */
a0d0e21e 1181 case BOUND:
ffc61ed2 1182 if (do_utf8) {
12d33761 1183 if (s == PL_bostr)
ffc61ed2
JH
1184 tmp = '\n';
1185 else {
b4f7163a 1186 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1187
b4f7163a 1188 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1189 }
1190 tmp = ((OP(c) == BOUND ?
9041c2e3 1191 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1192 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1193 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1194 if (tmp == !(OP(c) == BOUND ?
3568d838 1195 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1196 isALNUM_LC_utf8((U8*)s)))
1197 {
1198 tmp = !tmp;
1199 if ((norun || regtry(prog, s)))
1200 goto got_it;
1201 }
078c425b 1202 s += uskip;
a687059c 1203 }
a0d0e21e 1204 }
667bb95a 1205 else {
12d33761 1206 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1207 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1208 while (s < strend) {
1209 if (tmp ==
1210 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1211 tmp = !tmp;
1212 if ((norun || regtry(prog, s)))
1213 goto got_it;
1214 }
1215 s++;
a0ed51b3 1216 }
a0ed51b3 1217 }
6eb5f6b9 1218 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1219 goto got_it;
1220 break;
bbce6d69 1221 case NBOUNDL:
3280af22 1222 PL_reg_flags |= RF_tainted;
bbce6d69 1223 /* FALL THROUGH */
a0d0e21e 1224 case NBOUND:
ffc61ed2 1225 if (do_utf8) {
12d33761 1226 if (s == PL_bostr)
ffc61ed2
JH
1227 tmp = '\n';
1228 else {
b4f7163a 1229 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1230
b4f7163a 1231 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1232 }
1233 tmp = ((OP(c) == NBOUND ?
9041c2e3 1234 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1235 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1236 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2 1237 if (tmp == !(OP(c) == NBOUND ?
3568d838 1238 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1239 isALNUM_LC_utf8((U8*)s)))
1240 tmp = !tmp;
1241 else if ((norun || regtry(prog, s)))
1242 goto got_it;
078c425b 1243 s += uskip;
ffc61ed2 1244 }
a0d0e21e 1245 }
667bb95a 1246 else {
12d33761 1247 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1248 tmp = ((OP(c) == NBOUND ?
1249 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1250 while (s < strend) {
1251 if (tmp ==
1252 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1253 tmp = !tmp;
1254 else if ((norun || regtry(prog, s)))
1255 goto got_it;
1256 s++;
1257 }
a0ed51b3 1258 }
6eb5f6b9 1259 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1260 goto got_it;
1261 break;
a0d0e21e 1262 case ALNUM:
ffc61ed2 1263 if (do_utf8) {
8269fa76 1264 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1265 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1266 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1267 if (tmp && (norun || regtry(prog, s)))
1268 goto got_it;
1269 else
1270 tmp = doevery;
1271 }
bbce6d69 1272 else
ffc61ed2 1273 tmp = 1;
078c425b 1274 s += uskip;
bbce6d69 1275 }
bbce6d69 1276 }
ffc61ed2
JH
1277 else {
1278 while (s < strend) {
1279 if (isALNUM(*s)) {
1280 if (tmp && (norun || regtry(prog, s)))
1281 goto got_it;
1282 else
1283 tmp = doevery;
1284 }
a0ed51b3 1285 else
ffc61ed2
JH
1286 tmp = 1;
1287 s++;
a0ed51b3 1288 }
a0ed51b3
LW
1289 }
1290 break;
bbce6d69 1291 case ALNUML:
3280af22 1292 PL_reg_flags |= RF_tainted;
ffc61ed2 1293 if (do_utf8) {
078c425b 1294 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1295 if (isALNUM_LC_utf8((U8*)s)) {
1296 if (tmp && (norun || regtry(prog, s)))
1297 goto got_it;
1298 else
1299 tmp = doevery;
1300 }
a687059c 1301 else
ffc61ed2 1302 tmp = 1;
078c425b 1303 s += uskip;
a0d0e21e 1304 }
a0d0e21e 1305 }
ffc61ed2
JH
1306 else {
1307 while (s < strend) {
1308 if (isALNUM_LC(*s)) {
1309 if (tmp && (norun || regtry(prog, s)))
1310 goto got_it;
1311 else
1312 tmp = doevery;
1313 }
a0ed51b3 1314 else
ffc61ed2
JH
1315 tmp = 1;
1316 s++;
a0ed51b3 1317 }
a0ed51b3
LW
1318 }
1319 break;
a0d0e21e 1320 case NALNUM:
ffc61ed2 1321 if (do_utf8) {
8269fa76 1322 LOAD_UTF8_CHARCLASS(alnum,"a");
078c425b 1323 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1324 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1325 if (tmp && (norun || regtry(prog, s)))
1326 goto got_it;
1327 else
1328 tmp = doevery;
1329 }
bbce6d69 1330 else
ffc61ed2 1331 tmp = 1;
078c425b 1332 s += uskip;
bbce6d69 1333 }
bbce6d69 1334 }
ffc61ed2
JH
1335 else {
1336 while (s < strend) {
1337 if (!isALNUM(*s)) {
1338 if (tmp && (norun || regtry(prog, s)))
1339 goto got_it;
1340 else
1341 tmp = doevery;
1342 }
a0ed51b3 1343 else
ffc61ed2
JH
1344 tmp = 1;
1345 s++;
a0ed51b3 1346 }
a0ed51b3
LW
1347 }
1348 break;
bbce6d69 1349 case NALNUML:
3280af22 1350 PL_reg_flags |= RF_tainted;
ffc61ed2 1351 if (do_utf8) {
078c425b 1352 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1353 if (!isALNUM_LC_utf8((U8*)s)) {
1354 if (tmp && (norun || regtry(prog, s)))
1355 goto got_it;
1356 else
1357 tmp = doevery;
1358 }
a687059c 1359 else
ffc61ed2 1360 tmp = 1;
078c425b 1361 s += uskip;
a687059c 1362 }
a0d0e21e 1363 }
ffc61ed2
JH
1364 else {
1365 while (s < strend) {
1366 if (!isALNUM_LC(*s)) {
1367 if (tmp && (norun || regtry(prog, s)))
1368 goto got_it;
1369 else
1370 tmp = doevery;
1371 }
a0ed51b3 1372 else
ffc61ed2
JH
1373 tmp = 1;
1374 s++;
a0ed51b3 1375 }
a0ed51b3
LW
1376 }
1377 break;
a0d0e21e 1378 case SPACE:
ffc61ed2 1379 if (do_utf8) {
8269fa76 1380 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1381 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1382 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1383 if (tmp && (norun || regtry(prog, s)))
1384 goto got_it;
1385 else
1386 tmp = doevery;
1387 }
a0d0e21e 1388 else
ffc61ed2 1389 tmp = 1;
078c425b 1390 s += uskip;
2304df62 1391 }
a0d0e21e 1392 }
ffc61ed2
JH
1393 else {
1394 while (s < strend) {
1395 if (isSPACE(*s)) {
1396 if (tmp && (norun || regtry(prog, s)))
1397 goto got_it;
1398 else
1399 tmp = doevery;
1400 }
a0ed51b3 1401 else
ffc61ed2
JH
1402 tmp = 1;
1403 s++;
a0ed51b3 1404 }
a0ed51b3
LW
1405 }
1406 break;
bbce6d69 1407 case SPACEL:
3280af22 1408 PL_reg_flags |= RF_tainted;
ffc61ed2 1409 if (do_utf8) {
078c425b 1410 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1411 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1412 if (tmp && (norun || regtry(prog, s)))
1413 goto got_it;
1414 else
1415 tmp = doevery;
1416 }
bbce6d69 1417 else
ffc61ed2 1418 tmp = 1;
078c425b 1419 s += uskip;
bbce6d69 1420 }
bbce6d69 1421 }
ffc61ed2
JH
1422 else {
1423 while (s < strend) {
1424 if (isSPACE_LC(*s)) {
1425 if (tmp && (norun || regtry(prog, s)))
1426 goto got_it;
1427 else
1428 tmp = doevery;
1429 }
a0ed51b3 1430 else
ffc61ed2
JH
1431 tmp = 1;
1432 s++;
a0ed51b3 1433 }
a0ed51b3
LW
1434 }
1435 break;
a0d0e21e 1436 case NSPACE:
ffc61ed2 1437 if (do_utf8) {
8269fa76 1438 LOAD_UTF8_CHARCLASS(space," ");
078c425b 1439 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1440 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1441 if (tmp && (norun || regtry(prog, s)))
1442 goto got_it;
1443 else
1444 tmp = doevery;
1445 }
a0d0e21e 1446 else
ffc61ed2 1447 tmp = 1;
078c425b 1448 s += uskip;
a687059c 1449 }
a0d0e21e 1450 }
ffc61ed2
JH
1451 else {
1452 while (s < strend) {
1453 if (!isSPACE(*s)) {
1454 if (tmp && (norun || regtry(prog, s)))
1455 goto got_it;
1456 else
1457 tmp = doevery;
1458 }
a0ed51b3 1459 else
ffc61ed2
JH
1460 tmp = 1;
1461 s++;
a0ed51b3 1462 }
a0ed51b3
LW
1463 }
1464 break;
bbce6d69 1465 case NSPACEL:
3280af22 1466 PL_reg_flags |= RF_tainted;
ffc61ed2 1467 if (do_utf8) {
078c425b 1468 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1469 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1470 if (tmp && (norun || regtry(prog, s)))
1471 goto got_it;
1472 else
1473 tmp = doevery;
1474 }
bbce6d69 1475 else
ffc61ed2 1476 tmp = 1;
078c425b 1477 s += uskip;
bbce6d69 1478 }
bbce6d69 1479 }
ffc61ed2
JH
1480 else {
1481 while (s < strend) {
1482 if (!isSPACE_LC(*s)) {
1483 if (tmp && (norun || regtry(prog, s)))
1484 goto got_it;
1485 else
1486 tmp = doevery;
1487 }
a0ed51b3 1488 else
ffc61ed2
JH
1489 tmp = 1;
1490 s++;
a0ed51b3 1491 }
a0ed51b3
LW
1492 }
1493 break;
a0d0e21e 1494 case DIGIT:
ffc61ed2 1495 if (do_utf8) {
8269fa76 1496 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1497 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1498 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1499 if (tmp && (norun || regtry(prog, s)))
1500 goto got_it;
1501 else
1502 tmp = doevery;
1503 }
a0d0e21e 1504 else
ffc61ed2 1505 tmp = 1;
078c425b 1506 s += uskip;
2b69d0c2 1507 }
a0d0e21e 1508 }
ffc61ed2
JH
1509 else {
1510 while (s < strend) {
1511 if (isDIGIT(*s)) {
1512 if (tmp && (norun || regtry(prog, s)))
1513 goto got_it;
1514 else
1515 tmp = doevery;
1516 }
a0ed51b3 1517 else
ffc61ed2
JH
1518 tmp = 1;
1519 s++;
a0ed51b3 1520 }
a0ed51b3
LW
1521 }
1522 break;
b8c5462f
JH
1523 case DIGITL:
1524 PL_reg_flags |= RF_tainted;
ffc61ed2 1525 if (do_utf8) {
078c425b 1526 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1527 if (isDIGIT_LC_utf8((U8*)s)) {
1528 if (tmp && (norun || regtry(prog, s)))
1529 goto got_it;
1530 else
1531 tmp = doevery;
1532 }
b8c5462f 1533 else
ffc61ed2 1534 tmp = 1;
078c425b 1535 s += uskip;
b8c5462f 1536 }
b8c5462f 1537 }
ffc61ed2
JH
1538 else {
1539 while (s < strend) {
1540 if (isDIGIT_LC(*s)) {
1541 if (tmp && (norun || regtry(prog, s)))
1542 goto got_it;
1543 else
1544 tmp = doevery;
1545 }
b8c5462f 1546 else
ffc61ed2
JH
1547 tmp = 1;
1548 s++;
b8c5462f 1549 }
b8c5462f
JH
1550 }
1551 break;
a0d0e21e 1552 case NDIGIT:
ffc61ed2 1553 if (do_utf8) {
8269fa76 1554 LOAD_UTF8_CHARCLASS(digit,"0");
078c425b 1555 while (s + (uskip = UTF8SKIP(s)) <= strend) {
3568d838 1556 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1557 if (tmp && (norun || regtry(prog, s)))
1558 goto got_it;
1559 else
1560 tmp = doevery;
1561 }
a0d0e21e 1562 else
ffc61ed2 1563 tmp = 1;
078c425b 1564 s += uskip;
a687059c 1565 }
a0d0e21e 1566 }
ffc61ed2
JH
1567 else {
1568 while (s < strend) {
1569 if (!isDIGIT(*s)) {
1570 if (tmp && (norun || regtry(prog, s)))
1571 goto got_it;
1572 else
1573 tmp = doevery;
1574 }
a0ed51b3 1575 else
ffc61ed2
JH
1576 tmp = 1;
1577 s++;
a0ed51b3 1578 }
a0ed51b3
LW
1579 }
1580 break;
b8c5462f
JH
1581 case NDIGITL:
1582 PL_reg_flags |= RF_tainted;
ffc61ed2 1583 if (do_utf8) {
078c425b 1584 while (s + (uskip = UTF8SKIP(s)) <= strend) {
ffc61ed2
JH
1585 if (!isDIGIT_LC_utf8((U8*)s)) {
1586 if (tmp && (norun || regtry(prog, s)))
1587 goto got_it;
1588 else
1589 tmp = doevery;
1590 }
b8c5462f 1591 else
ffc61ed2 1592 tmp = 1;
078c425b 1593 s += uskip;
b8c5462f 1594 }
a0ed51b3 1595 }
ffc61ed2
JH
1596 else {
1597 while (s < strend) {
1598 if (!isDIGIT_LC(*s)) {
1599 if (tmp && (norun || regtry(prog, s)))
1600 goto got_it;
1601 else
1602 tmp = doevery;
1603 }
cf93c79d 1604 else
ffc61ed2
JH
1605 tmp = 1;
1606 s++;
b8c5462f 1607 }
b8c5462f
JH
1608 }
1609 break;
b3c9acc1 1610 default:
3c3eec57
GS
1611 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1612 break;
d6a28714 1613 }
6eb5f6b9
JH
1614 return 0;
1615 got_it:
1616 return s;
1617}
1618
1619/*
1620 - regexec_flags - match a regexp against a string
1621 */
1622I32
1623Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1624 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1625/* strend: pointer to null at end of string */
1626/* strbeg: real beginning of string */
1627/* minend: end of match must be >=minend after stringarg. */
1628/* data: May be used for some additional optimizations. */
1629/* nosave: For optimizations. */
1630{
6eb5f6b9
JH
1631 register char *s;
1632 register regnode *c;
1633 register char *startpos = stringarg;
6eb5f6b9
JH
1634 I32 minlen; /* must match at least this many chars */
1635 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1636 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1637 constant substr. */ /* CC */
1638 I32 end_shift = 0; /* Same for the end. */ /* CC */
1639 I32 scream_pos = -1; /* Internal iterator of scream. */
1640 char *scream_olds;
1641 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1642 bool do_utf8 = DO_UTF8(sv);
7fba1cd6 1643 I32 multiline = prog->reganch & PMf_MULTILINE;
2a782b5b 1644#ifdef DEBUGGING
9e55ce06
JH
1645 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1646 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1647#endif
a3621e74
YO
1648
1649 GET_RE_DEBUG_FLAGS_DECL;
1650
a30b2f1f 1651 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1652
1653 PL_regcc = 0;
1654
1655 cache_re(prog);
1656#ifdef DEBUGGING
aea4f609 1657 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1658#endif
1659
1660 /* Be paranoid... */
1661 if (prog == NULL || startpos == NULL) {
1662 Perl_croak(aTHX_ "NULL regexp parameter");
1663 return 0;
1664 }
1665
1666 minlen = prog->minlen;
61a36c01 1667 if (strend - startpos < minlen) {
a3621e74 1668 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1669 "String too short [regexec_flags]...\n"));
1670 goto phooey;
1aa99e6b 1671 }
6eb5f6b9 1672
6eb5f6b9
JH
1673 /* Check validity of program. */
1674 if (UCHARAT(prog->program) != REG_MAGIC) {
1675 Perl_croak(aTHX_ "corrupted regexp program");
1676 }
1677
1678 PL_reg_flags = 0;
1679 PL_reg_eval_set = 0;
1680 PL_reg_maxiter = 0;
1681
1682 if (prog->reganch & ROPT_UTF8)
1683 PL_reg_flags |= RF_utf8;
1684
1685 /* Mark beginning of line for ^ and lookbehind. */
1686 PL_regbol = startpos;
1687 PL_bostr = strbeg;
1688 PL_reg_sv = sv;
1689
1690 /* Mark end of line for $ (and such) */
1691 PL_regeol = strend;
1692
1693 /* see how far we have to get to not match where we matched before */
1694 PL_regtill = startpos+minend;
1695
1696 /* We start without call_cc context. */
1697 PL_reg_call_cc = 0;
1698
1699 /* If there is a "must appear" string, look for it. */
1700 s = startpos;
1701
1702 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1703 MAGIC *mg;
1704
1705 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1706 PL_reg_ganch = startpos;
1707 else if (sv && SvTYPE(sv) >= SVt_PVMG
1708 && SvMAGIC(sv)
14befaf4
DM
1709 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1710 && mg->mg_len >= 0) {
6eb5f6b9
JH
1711 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1712 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1713 if (s > PL_reg_ganch)
6eb5f6b9
JH
1714 goto phooey;
1715 s = PL_reg_ganch;
1716 }
1717 }
1718 else /* pos() not defined */
1719 PL_reg_ganch = strbeg;
1720 }
1721
33b8afdf 1722 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1723 re_scream_pos_data d;
1724
1725 d.scream_olds = &scream_olds;
1726 d.scream_pos = &scream_pos;
1727 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1728 if (!s) {
a3621e74 1729 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1730 goto phooey; /* not present */
3fa9c3d7 1731 }
6eb5f6b9
JH
1732 }
1733
a3621e74 1734 DEBUG_EXECUTE_r({
9e55ce06
JH
1735 char *s0 = UTF ?
1736 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1737 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1738 prog->precomp;
1739 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1740 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1741 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1742 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1743 if (!PL_colorset)
1744 reginitcolors();
1745 PerlIO_printf(Perl_debug_log,
9e55ce06 1746 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1747 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1748 len0, len0, s0,
2a782b5b 1749 PL_colors[1],
9e55ce06 1750 len0 > 60 ? "..." : "",
2a782b5b 1751 PL_colors[0],
9e55ce06
JH
1752 (int)(len1 > 60 ? 60 : len1),
1753 s1, PL_colors[1],
1754 (len1 > 60 ? "..." : "")
2a782b5b
JH
1755 );
1756 });
6eb5f6b9
JH
1757
1758 /* Simplest case: anchored match need be tried only once. */
1759 /* [unless only anchor is BOL and multiline is set] */
1760 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1761 if (s == startpos && regtry(prog, startpos))
1762 goto got_it;
7fba1cd6 1763 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1764 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1765 {
1766 char *end;
1767
1768 if (minlen)
1769 dontbother = minlen - 1;
1aa99e6b 1770 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1771 /* for multiline we only have to try after newlines */
33b8afdf 1772 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1773 if (s == startpos)
1774 goto after_try;
1775 while (1) {
1776 if (regtry(prog, s))
1777 goto got_it;
1778 after_try:
1779 if (s >= end)
1780 goto phooey;
1781 if (prog->reganch & RE_USE_INTUIT) {
1782 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1783 if (!s)
1784 goto phooey;
1785 }
1786 else
1787 s++;
1788 }
1789 } else {
1790 if (s > startpos)
1791 s--;
1792 while (s < end) {
1793 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1794 if (regtry(prog, s))
1795 goto got_it;
1796 }
1797 }
1798 }
1799 }
1800 goto phooey;
1801 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1802 if (regtry(prog, PL_reg_ganch))
1803 goto got_it;
1804 goto phooey;
1805 }
1806
1807 /* Messy cases: unanchored match. */
33b8afdf 1808 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1809 /* we have /x+whatever/ */
1810 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1811 char ch;
bf93d4cc
GS
1812#ifdef DEBUGGING
1813 int did_match = 0;
1814#endif
33b8afdf
JH
1815 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1816 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1817 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1818
1aa99e6b 1819 if (do_utf8) {
6eb5f6b9
JH
1820 while (s < strend) {
1821 if (*s == ch) {
a3621e74 1822 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1823 if (regtry(prog, s)) goto got_it;
1824 s += UTF8SKIP(s);
1825 while (s < strend && *s == ch)
1826 s += UTF8SKIP(s);
1827 }
1828 s += UTF8SKIP(s);
1829 }
1830 }
1831 else {
1832 while (s < strend) {
1833 if (*s == ch) {
a3621e74 1834 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1835 if (regtry(prog, s)) goto got_it;
1836 s++;
1837 while (s < strend && *s == ch)
1838 s++;
1839 }
1840 s++;
1841 }
1842 }
a3621e74 1843 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1844 PerlIO_printf(Perl_debug_log,
b7953727
JH
1845 "Did not find anchored character...\n")
1846 );
6eb5f6b9
JH
1847 }
1848 /*SUPPRESS 560*/
33b8afdf
JH
1849 else if (prog->anchored_substr != Nullsv
1850 || prog->anchored_utf8 != Nullsv
1851 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1852 && prog->float_max_offset < strend - s)) {
1853 SV *must;
1854 I32 back_max;
1855 I32 back_min;
1856 char *last;
6eb5f6b9 1857 char *last1; /* Last position checked before */
bf93d4cc
GS
1858#ifdef DEBUGGING
1859 int did_match = 0;
1860#endif
33b8afdf
JH
1861 if (prog->anchored_substr || prog->anchored_utf8) {
1862 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1863 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1864 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1865 back_max = back_min = prog->anchored_offset;
1866 } else {
1867 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1868 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1869 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1870 back_max = prog->float_max_offset;
1871 back_min = prog->float_min_offset;
1872 }
1873 if (must == &PL_sv_undef)
1874 /* could not downgrade utf8 check substring, so must fail */
1875 goto phooey;
1876
1877 last = HOP3c(strend, /* Cannot start after this */
1878 -(I32)(CHR_SVLEN(must)
1879 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1880
1881 if (s > PL_bostr)
1882 last1 = HOPc(s, -1);
1883 else
1884 last1 = s - 1; /* bogus */
1885
1886 /* XXXX check_substr already used to find `s', can optimize if
1887 check_substr==must. */
1888 scream_pos = -1;
1889 dontbother = end_shift;
1890 strend = HOPc(strend, -dontbother);
1891 while ( (s <= last) &&
9041c2e3 1892 ((flags & REXEC_SCREAM)
1aa99e6b 1893 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1894 end_shift, &scream_pos, 0))
1aa99e6b 1895 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1896 (unsigned char*)strend, must,
7fba1cd6 1897 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1898 /* we may be pointing at the wrong string */
1899 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
7ef91622 1900 s = strbeg + (s - SvPVX(sv));
a3621e74 1901 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1902 if (HOPc(s, -back_max) > last1) {
1903 last1 = HOPc(s, -back_min);
1904 s = HOPc(s, -back_max);
1905 }
1906 else {
1907 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1908
1909 last1 = HOPc(s, -back_min);
1910 s = t;
1911 }
1aa99e6b 1912 if (do_utf8) {
6eb5f6b9
JH
1913 while (s <= last1) {
1914 if (regtry(prog, s))
1915 goto got_it;
1916 s += UTF8SKIP(s);
1917 }
1918 }
1919 else {
1920 while (s <= last1) {
1921 if (regtry(prog, s))
1922 goto got_it;
1923 s++;
1924 }
1925 }
1926 }
a3621e74 1927 DEBUG_EXECUTE_r(if (!did_match)
b7953727
JH
1928 PerlIO_printf(Perl_debug_log,
1929 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1930 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1931 ? "anchored" : "floating"),
1932 PL_colors[0],
1933 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1934 SvPVX(must),
b7953727
JH
1935 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1936 );
6eb5f6b9
JH
1937 goto phooey;
1938 }
155aba94 1939 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1940 if (minlen) {
1941 I32 op = (U8)OP(prog->regstclass);
66e933ab 1942 /* don't bother with what can't match */
f14c76ed
RGS
1943 if (PL_regkind[op] != EXACT && op != CANY)
1944 strend = HOPc(strend, -(minlen - 1));
1945 }
a3621e74 1946 DEBUG_EXECUTE_r({
ffc61ed2 1947 SV *prop = sv_newmortal();
9e55ce06
JH
1948 char *s0;
1949 char *s1;
1950 int len0;
1951 int len1;
1952
ffc61ed2 1953 regprop(prop, c);
9e55ce06
JH
1954 s0 = UTF ?
1955 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1956 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1957 SvPVX(prop);
1958 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1959 s1 = UTF ?
c728cb41 1960 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1961 len1 = UTF ? SvCUR(dsv1) : strend - s;
1962 PerlIO_printf(Perl_debug_log,
1963 "Matching stclass `%*.*s' against `%*.*s'\n",
1964 len0, len0, s0,
1965 len1, len1, s1);
ffc61ed2 1966 });
6eb5f6b9
JH
1967 if (find_byclass(prog, c, s, strend, startpos, 0))
1968 goto got_it;
a3621e74 1969 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1970 }
1971 else {
1972 dontbother = 0;
33b8afdf
JH
1973 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1974 /* Trim the end. */
d6a28714 1975 char *last;
33b8afdf
JH
1976 SV* float_real;
1977
1978 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1979 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1980 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1981
1982 if (flags & REXEC_SCREAM) {
33b8afdf 1983 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1984 end_shift, &scream_pos, 1); /* last one */
1985 if (!last)
ffc61ed2 1986 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1987 /* we may be pointing at the wrong string */
1988 else if (RX_MATCH_COPIED(prog))
7ef91622 1989 s = strbeg + (s - SvPVX(sv));
b8c5462f 1990 }
d6a28714
JH
1991 else {
1992 STRLEN len;
33b8afdf 1993 char *little = SvPV(float_real, len);
d6a28714 1994
33b8afdf 1995 if (SvTAIL(float_real)) {
d6a28714
JH
1996 if (memEQ(strend - len + 1, little, len - 1))
1997 last = strend - len + 1;
7fba1cd6 1998 else if (!multiline)
9041c2e3 1999 last = memEQ(strend - len, little, len)
d6a28714 2000 ? strend - len : Nullch;
b8c5462f 2001 else
d6a28714
JH
2002 goto find_last;
2003 } else {
2004 find_last:
9041c2e3 2005 if (len)
d6a28714 2006 last = rninstr(s, strend, little, little + len);
b8c5462f 2007 else
d6a28714 2008 last = strend; /* matching `$' */
b8c5462f 2009 }
b8c5462f 2010 }
bf93d4cc 2011 if (last == NULL) {
a3621e74 2012 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc
GS
2013 "%sCan't trim the tail, match fails (should not happen)%s\n",
2014 PL_colors[4],PL_colors[5]));
2015 goto phooey; /* Should not happen! */
2016 }
d6a28714
JH
2017 dontbother = strend - last + prog->float_min_offset;
2018 }
2019 if (minlen && (dontbother < minlen))
2020 dontbother = minlen - 1;
2021 strend -= dontbother; /* this one's always in bytes! */
2022 /* We don't know much -- general case. */
1aa99e6b 2023 if (do_utf8) {
d6a28714
JH
2024 for (;;) {
2025 if (regtry(prog, s))
2026 goto got_it;
2027 if (s >= strend)
2028 break;
b8c5462f 2029 s += UTF8SKIP(s);
d6a28714
JH
2030 };
2031 }
2032 else {
2033 do {
2034 if (regtry(prog, s))
2035 goto got_it;
2036 } while (s++ < strend);
2037 }
2038 }
2039
2040 /* Failure. */
2041 goto phooey;
2042
2043got_it:
2044 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2045
2046 if (PL_reg_eval_set) {
2047 /* Preserve the current value of $^R */
2048 if (oreplsv != GvSV(PL_replgv))
2049 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2050 restored, the value remains
2051 the same. */
acfe0abc 2052 restore_pos(aTHX_ 0);
d6a28714
JH
2053 }
2054
2055 /* make sure $`, $&, $', and $digit will work later */
2056 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2057 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2058 if (flags & REXEC_COPY_STR) {
2059 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2060#ifdef PERL_COPY_ON_WRITE
2061 if ((SvIsCOW(sv)
2062 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2063 if (DEBUG_C_TEST) {
2064 PerlIO_printf(Perl_debug_log,
2065 "Copy on write: regexp capture, type %d\n",
2066 (int) SvTYPE(sv));
2067 }
2068 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2069 prog->subbeg = SvPVX(prog->saved_copy);
2070 assert (SvPOKp(prog->saved_copy));
2071 } else
2072#endif
2073 {
2074 RX_MATCH_COPIED_on(prog);
2075 s = savepvn(strbeg, i);
2076 prog->subbeg = s;
2077 }
d6a28714 2078 prog->sublen = i;
d6a28714
JH
2079 }
2080 else {
2081 prog->subbeg = strbeg;
2082 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2083 }
2084 }
9041c2e3 2085
d6a28714
JH
2086 return 1;
2087
2088phooey:
a3621e74 2089 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
bf93d4cc 2090 PL_colors[4],PL_colors[5]));
d6a28714 2091 if (PL_reg_eval_set)
acfe0abc 2092 restore_pos(aTHX_ 0);
d6a28714
JH
2093 return 0;
2094}
2095
2096/*
2097 - regtry - try match at specific point
2098 */
2099STATIC I32 /* 0 failure, 1 success */
2100S_regtry(pTHX_ regexp *prog, char *startpos)
2101{
d6a28714
JH
2102 register I32 i;
2103 register I32 *sp;
2104 register I32 *ep;
2105 CHECKPOINT lastcp;
a3621e74 2106 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 2107
02db2b7b
IZ
2108#ifdef DEBUGGING
2109 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2110#endif
d6a28714
JH
2111 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2112 MAGIC *mg;
2113
2114 PL_reg_eval_set = RS_init;
a3621e74 2115 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
2116 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2117 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2118 ));
e8347627 2119 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2120 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2121 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2122 SAVETMPS;
2123 /* Apparently this is not needed, judging by wantarray. */
e8347627 2124 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2125 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2126
2127 if (PL_reg_sv) {
2128 /* Make $_ available to executed code. */
2129 if (PL_reg_sv != DEFSV) {
59f00321 2130 SAVE_DEFSV;
d6a28714 2131 DEFSV = PL_reg_sv;
b8c5462f 2132 }
d6a28714 2133
9041c2e3 2134 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2135 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2136 /* prepare for quick setting of pos */
14befaf4
DM
2137 sv_magic(PL_reg_sv, (SV*)0,
2138 PERL_MAGIC_regex_global, Nullch, 0);
2139 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2140 mg->mg_len = -1;
b8c5462f 2141 }
d6a28714
JH
2142 PL_reg_magic = mg;
2143 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2144 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2145 }
09687e5a 2146 if (!PL_reg_curpm) {
0f79a09d 2147 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2148#ifdef USE_ITHREADS
2149 {
2150 SV* repointer = newSViv(0);
577e12cc 2151 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2152 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2153 av_push(PL_regex_padav,repointer);
2154 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2155 PL_regex_pad = AvARRAY(PL_regex_padav);
2156 }
2157#endif
2158 }
aaa362c4 2159 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2160 PL_reg_oldcurpm = PL_curpm;
2161 PL_curpm = PL_reg_curpm;
2162 if (RX_MATCH_COPIED(prog)) {
2163 /* Here is a serious problem: we cannot rewrite subbeg,
2164 since it may be needed if this match fails. Thus
2165 $` inside (?{}) could fail... */
2166 PL_reg_oldsaved = prog->subbeg;
2167 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2168#ifdef PERL_COPY_ON_WRITE
2169 PL_nrs = prog->saved_copy;
2170#endif
d6a28714
JH
2171 RX_MATCH_COPIED_off(prog);
2172 }
2173 else
2174 PL_reg_oldsaved = Nullch;
2175 prog->subbeg = PL_bostr;
2176 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2177 }
2178 prog->startp[0] = startpos - PL_bostr;
2179 PL_reginput = startpos;
2180 PL_regstartp = prog->startp;
2181 PL_regendp = prog->endp;
2182 PL_reglastparen = &prog->lastparen;
a01268b5 2183 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2184 prog->lastparen = 0;
03994de8 2185 prog->lastcloseparen = 0;
d6a28714 2186 PL_regsize = 0;
a3621e74 2187 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2188 if (PL_reg_start_tmpl <= prog->nparens) {
2189 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2190 if(PL_reg_start_tmp)
2191 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2192 else
2193 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2194 }
2195
2196 /* XXXX What this code is doing here?!!! There should be no need
2197 to do this again and again, PL_reglastparen should take care of
3dd2943c 2198 this! --ilya*/
dafc8851
JH
2199
2200 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2201 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2202 * PL_reglastparen), is not needed at all by the test suite
2203 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2204 * enough, for building DynaLoader, or otherwise this
2205 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2206 * will happen. Meanwhile, this code *is* needed for the
2207 * above-mentioned test suite tests to succeed. The common theme
2208 * on those tests seems to be returning null fields from matches.
2209 * --jhi */
dafc8851 2210#if 1
d6a28714
JH
2211 sp = prog->startp;
2212 ep = prog->endp;
2213 if (prog->nparens) {
eb160463 2214 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2215 *++sp = -1;
2216 *++ep = -1;
2217 }
2218 }
dafc8851 2219#endif
02db2b7b 2220 REGCP_SET(lastcp);
d6a28714
JH
2221 if (regmatch(prog->program + 1)) {
2222 prog->endp[0] = PL_reginput - PL_bostr;
2223 return 1;
2224 }
02db2b7b 2225 REGCP_UNWIND(lastcp);
d6a28714
JH
2226 return 0;
2227}
2228
02db2b7b
IZ
2229#define RE_UNWIND_BRANCH 1
2230#define RE_UNWIND_BRANCHJ 2
2231
2232union re_unwind_t;
2233
2234typedef struct { /* XX: makes sense to enlarge it... */
2235 I32 type;
2236 I32 prev;
2237 CHECKPOINT lastcp;
2238} re_unwind_generic_t;
2239
2240typedef struct {
2241 I32 type;
2242 I32 prev;
2243 CHECKPOINT lastcp;
2244 I32 lastparen;
2245 regnode *next;
2246 char *locinput;
2247 I32 nextchr;
2248#ifdef DEBUGGING
2249 int regindent;
2250#endif
2251} re_unwind_branch_t;
2252
2253typedef union re_unwind_t {
2254 I32 type;
2255 re_unwind_generic_t generic;
2256 re_unwind_branch_t branch;
2257} re_unwind_t;
2258
8ba1375e
MJD
2259#define sayYES goto yes
2260#define sayNO goto no
e0f9d4a8 2261#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2262#define sayYES_FINAL goto yes_final
2263#define sayYES_LOUD goto yes_loud
2264#define sayNO_FINAL goto no_final
2265#define sayNO_SILENT goto do_no
2266#define saySAME(x) if (x) goto yes; else goto no
2267
a3621e74
YO
2268/* this is used to determine how far from the left messages like
2269 'failed...' are printed. Currently 29 makes these messages line
2270 up with the opcode they refer to. Earlier perls used 25 which
2271 left these messages outdented making reviewing a debug output
2272 quite difficult.
2273*/
2274#define REPORT_CODE_OFF 29
2275
2276
2277/* Make sure there is a test for this +1 options in re_tests */
2278#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2279
2280#define TRIE_CHECK_STATE_IS_ACCEPTING STMT_START { \
2281 if ( trie->states[ state ].wordnum ) { \
2282 if ( !accepted ) { \
2283 ENTER; \
2284 SAVETMPS; \
2285 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN ; \
2286 sv_accept_buff=NEWSV( 1234, \
2287 bufflen * sizeof(reg_trie_accepted) - 1 ); \
2288 SvCUR_set( sv_accept_buff, sizeof(reg_trie_accepted) ); \
2289 SvPOK_on( sv_accept_buff ); \
2290 sv_2mortal( sv_accept_buff ); \
2291 accept_buff = (reg_trie_accepted*)SvPV_nolen( sv_accept_buff );\
2292 } else { \
2293 if ( accepted >= bufflen ) { \
2294 bufflen *= 2; \
2295 accept_buff =(reg_trie_accepted*)SvGROW( sv_accept_buff, \
2296 bufflen * sizeof(reg_trie_accepted) ); \
2297 } \
2298 SvCUR_set( sv_accept_buff,SvCUR( sv_accept_buff ) \
2299 + sizeof( reg_trie_accepted ) ); \
2300 } \
2301 accept_buff[ accepted ].wordnum = trie->states[ state ].wordnum; \
2302 accept_buff[ accepted ].endpos = uc; \
2303 ++accepted; \
2304 } } STMT_END
2305
2306#define TRIE_HANDLE_CHAR STMT_START { \
2307 if ( uvc < 256 ) { \
2308 charid = trie->charmap[ uvc ]; \
2309 } else { \
2310 charid = 0; \
2311 if( trie->widecharmap ) { \
2312 SV** svpp = (SV**)NULL; \
2313 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, \
2314 sizeof( UV ), 0 ); \
2315 if ( svpp ) { \
2316 charid = (U16)SvIV( *svpp ); \
2317 } \
2318 } \
2319 } \
2320 if ( charid && \
cc601c31
YO
2321 ( base + charid > trie->uniquecharcount ) && \
2322 ( base + charid - 1 - trie->uniquecharcount < trie->lasttrans) && \
a3621e74
YO
2323 trie->trans[ base + charid - 1 - trie->uniquecharcount ].check == state ) \
2324 { \
2325 state = trie->trans[ base + charid - 1 - trie->uniquecharcount ].next; \
2326 } else { \
2327 state = 0; \
2328 } \
2329 uc += len; \
2330 } STMT_END
8ba1375e 2331
d6a28714
JH
2332/*
2333 - regmatch - main matching routine
2334 *
2335 * Conceptually the strategy is simple: check to see whether the current
2336 * node matches, call self recursively to see whether the rest matches,
2337 * and then act accordingly. In practice we make some effort to avoid
2338 * recursion, in particular by going through "ordinary" nodes (that don't
2339 * need to know whether the rest of the match failed) by a loop instead of
2340 * by recursion.
2341 */
2342/* [lwall] I've hoisted the register declarations to the outer block in order to
2343 * maybe save a little bit of pushing and popping on the stack. It also takes
2344 * advantage of machines that use a register save mask on subroutine entry.
2345 */
2346STATIC I32 /* 0 failure, 1 success */
2347S_regmatch(pTHX_ regnode *prog)
2348{
d6a28714
JH
2349 register regnode *scan; /* Current node. */
2350 regnode *next; /* Next node. */
2351 regnode *inner; /* Next node in internal branch. */
2352 register I32 nextchr; /* renamed nextchr - nextchar colides with
2353 function of same name */
2354 register I32 n; /* no or next */
b7953727
JH
2355 register I32 ln = 0; /* len or last */
2356 register char *s = Nullch; /* operand or save */
d6a28714 2357 register char *locinput = PL_reginput;
b7953727 2358 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2359 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2360 I32 unwind = 0;
a3621e74
YO
2361
2362 /* used by the trie code */
2363 SV *sv_accept_buff; /* accepting states we have traversed */
2364 reg_trie_accepted *accept_buff; /* "" */
2365 reg_trie_data *trie; /* what trie are we using right now */
2366 U32 accepted = 0; /* how many accepting states we have seen*/
2367
b7953727 2368#if 0
02db2b7b 2369 I32 firstcp = PL_savestack_ix;
b7953727 2370#endif
53c4c00c 2371 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2372#ifdef DEBUGGING
ce333219
JH
2373 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2374 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2375 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
a3621e74
YO
2376
2377 SV *re_debug_flags;
2a782b5b 2378#endif
02db2b7b 2379
a3621e74
YO
2380 GET_RE_DEBUG_FLAGS;
2381
d6a28714
JH
2382#ifdef DEBUGGING
2383 PL_regindent++;
2384#endif
2385
a3621e74 2386
d6a28714
JH
2387 /* Note that nextchr is a byte even in UTF */
2388 nextchr = UCHARAT(locinput);
2389 scan = prog;
2390 while (scan != NULL) {
8ba1375e 2391
a3621e74 2392 DEBUG_EXECUTE_r( {
d6a28714
JH
2393 SV *prop = sv_newmortal();
2394 int docolor = *PL_colors[0];
2395 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2396 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2397 /* The part of the string before starttry has one color
2398 (pref0_len chars), between starttry and current
2399 position another one (pref_len - pref0_len chars),
2400 after the current position the third one.
2401 We assume that pref0_len <= pref_len, otherwise we
2402 decrease pref0_len. */
9041c2e3 2403 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2404 ? (5 + taill) - l : locinput - PL_bostr;
2405 int pref0_len;
d6a28714 2406
df1ffd02 2407 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2408 pref_len++;
2409 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2410 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2411 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2412 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2413 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2414 l--;
d6a28714
JH
2415 if (pref0_len < 0)
2416 pref0_len = 0;
2417 if (pref0_len > pref_len)
2418 pref0_len = pref_len;
2419 regprop(prop, scan);
2a782b5b
JH
2420 {
2421 char *s0 =
f14c76ed 2422 do_utf8 && OP(scan) != CANY ?
2a782b5b 2423 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2424 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2425 locinput - pref_len;
df1ffd02 2426 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2427 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2428 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2429 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2430 locinput - pref_len + pref0_len;
df1ffd02 2431 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2432 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2433 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2434 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2435 locinput;
df1ffd02 2436 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2437 PerlIO_printf(Perl_debug_log,
2438 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2439 (IV)(locinput - PL_bostr),
2440 PL_colors[4],
2441 len0, s0,
2442 PL_colors[5],
2443 PL_colors[2],
2444 len1, s1,
2445 PL_colors[3],
2446 (docolor ? "" : "> <"),
2447 PL_colors[0],
2448 len2, s2,
2449 PL_colors[1],
2450 15 - l - pref_len + 1,
2451 "",
2452 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2453 SvPVX(prop));
2454 }
2455 });
d6a28714
JH
2456
2457 next = scan + NEXT_OFF(scan);
2458 if (next == scan)
2459 next = NULL;
2460
2461 switch (OP(scan)) {
2462 case BOL:
7fba1cd6 2463 if (locinput == PL_bostr)
d6a28714
JH
2464 {
2465 /* regtill = regbol; */
b8c5462f
JH
2466 break;
2467 }
d6a28714
JH
2468 sayNO;
2469 case MBOL:
12d33761
HS
2470 if (locinput == PL_bostr ||
2471 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2472 {
b8c5462f
JH
2473 break;
2474 }
d6a28714
JH
2475 sayNO;
2476 case SBOL:
c2a73568 2477 if (locinput == PL_bostr)
b8c5462f 2478 break;
d6a28714
JH
2479 sayNO;
2480 case GPOS:
2481 if (locinput == PL_reg_ganch)
2482 break;
2483 sayNO;
2484 case EOL:
d6a28714
JH
2485 goto seol;
2486 case MEOL:
d6a28714 2487 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2488 sayNO;
b8c5462f 2489 break;
d6a28714
JH
2490 case SEOL:
2491 seol:
2492 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2493 sayNO;
d6a28714 2494 if (PL_regeol - locinput > 1)
b8c5462f 2495 sayNO;
b8c5462f 2496 break;
d6a28714
JH
2497 case EOS:
2498 if (PL_regeol != locinput)
b8c5462f 2499 sayNO;
d6a28714 2500 break;
ffc61ed2 2501 case SANY:
d6a28714 2502 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2503 sayNO;
f33976b4
DB
2504 if (do_utf8) {
2505 locinput += PL_utf8skip[nextchr];
2506 if (locinput > PL_regeol)
2507 sayNO;
2508 nextchr = UCHARAT(locinput);
2509 }
2510 else
2511 nextchr = UCHARAT(++locinput);
2512 break;
2513 case CANY:
2514 if (!nextchr && locinput >= PL_regeol)
2515 sayNO;
b8c5462f 2516 nextchr = UCHARAT(++locinput);
a0d0e21e 2517 break;
ffc61ed2 2518 case REG_ANY:
1aa99e6b
IH
2519 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2520 sayNO;
2521 if (do_utf8) {
b8c5462f 2522 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2523 if (locinput > PL_regeol)
2524 sayNO;
a0ed51b3 2525 nextchr = UCHARAT(locinput);
a0ed51b3 2526 }
1aa99e6b
IH
2527 else
2528 nextchr = UCHARAT(++locinput);
a0ed51b3 2529 break;
a3621e74
YO
2530
2531
2532
2533 /*
2534 traverse the TRIE keeping track of all accepting states
2535 we transition through until we get to a failing node.
2536
2537 we use two slightly different pieces of code to handle
2538 the traversal depending on whether its case sensitive or
2539 not. we reuse the accept code however. (this should probably
2540 be turned into a macro.)
2541
2542 */
2543 case TRIEF:
2544 case TRIEFL:
2545 {
2546
2547 U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
2548 U8 *uc = ( U8* )locinput;
2549 U32 state = 1;
2550 U16 charid = 0;
2551 U32 base = 0;
2552 UV uvc = 0;
2553 STRLEN len = 0;
2554 STRLEN foldlen = 0;
2555 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2556 U8 *uscan = (U8*)NULL;
2557 STRLEN bufflen=0;
2558 accepted = 0;
2559
2560 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2561
2562 while ( state && uc <= (U8*)PL_regeol ) {
2563
2564 TRIE_CHECK_STATE_IS_ACCEPTING;
2565
2566 base = trie->states[ state ].trans.base;
2567
2568 DEBUG_TRIE_EXECUTE_r(
2569 PerlIO_printf( Perl_debug_log,
2570 "%*s %sState: %4x, Base: %4x Accepted: %4x ",
2571 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2572 state, base, accepted );
2573 );
2574
2575 if ( base ) {
2576
2577 if ( do_utf8 || UTF ) {
2578 if ( foldlen>0 ) {
2579 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );
2580 foldlen -= len;
2581 uscan += len;
2582 len=0;
2583 } else {
2584 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2585 uvc = to_uni_fold( uvc, foldbuf, &foldlen );
2586 foldlen -= UNISKIP( uvc );
2587 uscan = foldbuf + UNISKIP( uvc );
2588 }
2589 } else {
2590 uvc = (U32)*uc;
2591 len = 1;
2592 }
2593
2594 TRIE_HANDLE_CHAR;
2595
2596 } else {
2597 state = 0;
2598 }
2599 DEBUG_TRIE_EXECUTE_r(
2600 PerlIO_printf( Perl_debug_log,
2601 "Charid:%3x CV:%4x After State: %4x%s\n",
2602 charid, uvc, state, PL_colors[5] );
2603 );
2604 }
2605 if ( !accepted ) {
2606 sayNO;
2607 } else {
2608 goto TrieAccept;
2609 }
2610 }
2611 /* unreached codepoint: we jump into the middle of the next case
2612 from previous if blocks */
2613 case TRIE:
2614 {
2615 U32 uniflags = ckWARN( WARN_UTF8 ) ? 0 : UTF8_ALLOW_ANY;
2616 U8 *uc = (U8*)locinput;
2617 U32 state = 1;
2618 U16 charid = 0;
2619 U32 base = 0;
2620 UV uvc = 0;
2621 STRLEN len = 0;
2622 STRLEN bufflen = 0;
2623 accepted = 0;
2624
2625 trie = (reg_trie_data*)PL_regdata->data[ ARG( scan ) ];
2626
2627 while ( state && uc <= (U8*)PL_regeol ) {
2628
2629 TRIE_CHECK_STATE_IS_ACCEPTING;
2630
2631 base = trie->states[ state ].trans.base;
2632
2633 DEBUG_TRIE_EXECUTE_r(
2634 PerlIO_printf( Perl_debug_log,
2635 "%*s %sState: %4x, Base: %4x Accepted: %4x ",
2636 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2637 state, base, accepted );
2638 );
2639
2640 if ( base ) {
2641
2642 if ( do_utf8 || UTF ) {
2643 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );
2644 } else {
2645 uvc = (U32)*uc;
2646 len = 1;
2647 }
2648
2649 TRIE_HANDLE_CHAR;
2650
2651 } else {
2652 state = 0;
2653 }
2654 DEBUG_TRIE_EXECUTE_r(
2655 PerlIO_printf( Perl_debug_log,
2656 "Charid:%3x CV:%4x After State: %4x%s\n",
2657 charid, uvc, state, PL_colors[5] );
2658 );
2659 }
2660 if ( !accepted ) {
2661 sayNO;
2662 }
2663 }
2664
2665
2666 /*
2667 There was at least one accepting state that we
2668 transitioned through. Presumably the number of accepting
2669 states is going to be low, typically one or two. So we
2670 simply scan through to find the one with lowest wordnum.
2671 Once we find it, we swap the last state into its place
2672 and decrement the size. We then try to match the rest of
2673 the pattern at the point where the word ends, if we
2674 succeed then we end the loop, otherwise the loop
2675 eventually terminates once all of the accepting states
2676 have been tried.
2677 */
2678 TrieAccept:
2679 {
2680 int gotit = 0;
2681
2682 if ( accepted == 1 ) {
2683 DEBUG_EXECUTE_r({
2684 SV **tmp = av_fetch( trie->words, accept_buff[ 0 ].wordnum-1, 0 );
2685 PerlIO_printf( Perl_debug_log,
2686 "%*s %sonly one match : #%d <%s>%s\n",
2687 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],
2688 accept_buff[ 0 ].wordnum,
2689 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",
2690 PL_colors[5] );
2691 });
cc601c31 2692 PL_reginput = (char *)accept_buff[ 0 ].endpos;
a3621e74
YO
2693 /* in this case we free tmps/leave before we call regmatch
2694 as we wont be using accept_buff again. */
2695 FREETMPS;
2696 LEAVE;
2697 gotit = regmatch( scan + NEXT_OFF( scan ) );
2698 } else {
2699 DEBUG_EXECUTE_r(
2700 PerlIO_printf( Perl_debug_log,"%*s %sgot %d possible matches%s\n",
2701 REPORT_CODE_OFF + PL_regindent * 2, "",PL_colors[4], accepted,
2702 PL_colors[5] );
2703 );
2704 while ( !gotit && accepted-- ) {
2705 U32 best = 0;
2706 U32 cur;
2707 for( cur = 1 ; cur <= accepted ; cur++ ) {
2708 DEBUG_TRIE_EXECUTE_r(
2709 PerlIO_printf( Perl_debug_log,
2710 "%*s %sgot %d (%d) as best, looking at %d (%d)%s\n",
2711 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2712 best, accept_buff[ best ].wordnum, cur,
2713 accept_buff[ cur ].wordnum, PL_colors[5] );
2714 );
2715
2716 if ( accept_buff[ cur ].wordnum < accept_buff[ best ].wordnum )
2717 best = cur;
2718 }
2719 DEBUG_EXECUTE_r({
2720 SV **tmp = av_fetch( trie->words, accept_buff[ best ].wordnum - 1, 0 );
2721 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at 0x%p%s\n",
2722 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],
2723 accept_buff[best].wordnum,
2724 tmp ? SvPV_nolen( *tmp ) : "not compiled under -Dr",scan,
2725 PL_colors[5] );
2726 });
2727 if ( best<accepted ) {
2728 reg_trie_accepted tmp = accept_buff[ best ];
2729 accept_buff[ best ] = accept_buff[ accepted ];
2730 accept_buff[ accepted ] = tmp;
2731 best = accepted;
2732 }
cc601c31 2733 PL_reginput = (char *)accept_buff[ best ].endpos;
a3621e74
YO
2734
2735 /*
2736 as far as I can tell we only need the SAVETMPS/FREETMPS
2737 for re's with EVAL in them but I'm leaving them in for
2738 all until I can be sure.
2739 */
2740 SAVETMPS;
2741 gotit = regmatch( scan + NEXT_OFF( scan ) ) ;
2742 FREETMPS;
2743 }
2744 FREETMPS;
2745 LEAVE;
2746 }
2747
2748 if ( gotit ) {
2749 sayYES;
2750 } else {
2751 sayNO;
2752 }
2753 }
2754 /* unreached codepoint */
d6a28714 2755 case EXACT:
cd439c50
IZ
2756 s = STRING(scan);
2757 ln = STR_LEN(scan);
eb160463 2758 if (do_utf8 != UTF) {
bc517b45 2759 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2760 char *l = locinput;
2761 char *e = s + ln;
bc517b45 2762 STRLEN ulen;
a72c7584 2763
5ff6fc6d
JH
2764 if (do_utf8) {
2765 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2766 while (s < e) {
2767 if (l >= PL_regeol)
5ff6fc6d
JH
2768 sayNO;
2769 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2770 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2771 ckWARN(WARN_UTF8) ?
2772 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2773 sayNO;
bc517b45 2774 l += ulen;
5ff6fc6d 2775 s ++;
1aa99e6b 2776 }
5ff6fc6d
JH
2777 }
2778 else {
2779 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2780 while (s < e) {
2781 if (l >= PL_regeol)
2782 sayNO;
5ff6fc6d 2783 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2784 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
872c91ae
JH
2785 ckWARN(WARN_UTF8) ?
2786 0 : UTF8_ALLOW_ANY))
1aa99e6b 2787 sayNO;
bc517b45 2788 s += ulen;
a72c7584 2789 l ++;
1aa99e6b 2790 }
5ff6fc6d 2791 }
1aa99e6b
IH
2792 locinput = l;
2793 nextchr = UCHARAT(locinput);
2794 break;
2795 }
bc517b45 2796 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2797 /* Inline the first character, for speed. */
2798 if (UCHARAT(s) != nextchr)
2799 sayNO;
2800 if (PL_regeol - locinput < ln)
2801 sayNO;
2802 if (ln > 1 && memNE(s, locinput, ln))
2803 sayNO;
2804 locinput += ln;
2805 nextchr = UCHARAT(locinput);
2806 break;
2807 case EXACTFL:
b8c5462f
JH
2808 PL_reg_flags |= RF_tainted;
2809 /* FALL THROUGH */
d6a28714 2810 case EXACTF:
cd439c50
IZ
2811 s = STRING(scan);
2812 ln = STR_LEN(scan);
d6a28714 2813
d07ddd77
JH
2814 if (do_utf8 || UTF) {
2815 /* Either target or the pattern are utf8. */
d6a28714 2816 char *l = locinput;
d07ddd77 2817 char *e = PL_regeol;
bc517b45 2818
eb160463 2819 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2820 l, &e, 0, do_utf8)) {
5486206c
JH
2821 /* One more case for the sharp s:
2822 * pack("U0U*", 0xDF) =~ /ss/i,
2823 * the 0xC3 0x9F are the UTF-8
2824 * byte sequence for the U+00DF. */
2825 if (!(do_utf8 &&
2826 toLOWER(s[0]) == 's' &&
2827 ln >= 2 &&
2828 toLOWER(s[1]) == 's' &&
2829 (U8)l[0] == 0xC3 &&
2830 e - l >= 2 &&
2831 (U8)l[1] == 0x9F))
2832 sayNO;
2833 }
d07ddd77
JH
2834 locinput = e;
2835 nextchr = UCHARAT(locinput);
2836 break;
a0ed51b3 2837 }
d6a28714 2838
bc517b45
JH
2839 /* Neither the target and the pattern are utf8. */
2840
d6a28714
JH
2841 /* Inline the first character, for speed. */
2842 if (UCHARAT(s) != nextchr &&
2843 UCHARAT(s) != ((OP(scan) == EXACTF)
2844 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2845 sayNO;
d6a28714 2846 if (PL_regeol - locinput < ln)
b8c5462f 2847 sayNO;
d6a28714
JH
2848 if (ln > 1 && (OP(scan) == EXACTF
2849 ? ibcmp(s, locinput, ln)
2850 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2851 sayNO;
d6a28714
JH
2852 locinput += ln;
2853 nextchr = UCHARAT(locinput);
a0d0e21e 2854 break;
d6a28714 2855 case ANYOF:
ffc61ed2 2856 if (do_utf8) {
9e55ce06
JH
2857 STRLEN inclasslen = PL_regeol - locinput;
2858
ba7b4546 2859 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2860 sayNO_ANYOF;
ffc61ed2
JH
2861 if (locinput >= PL_regeol)
2862 sayNO;
0f0076b4 2863 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2864 nextchr = UCHARAT(locinput);
e0f9d4a8 2865 break;
ffc61ed2
JH
2866 }
2867 else {
2868 if (nextchr < 0)
2869 nextchr = UCHARAT(locinput);
7d3e948e 2870 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2871 sayNO_ANYOF;
ffc61ed2
JH
2872 if (!nextchr && locinput >= PL_regeol)
2873 sayNO;
2874 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2875 break;
2876 }
2877 no_anyof:
2878 /* If we might have the case of the German sharp s
2879 * in a casefolding Unicode character class. */
2880
ebc501f0
JH
2881 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2882 locinput += SHARP_S_SKIP;
e0f9d4a8 2883 nextchr = UCHARAT(locinput);
ffc61ed2 2884 }
e0f9d4a8
JH
2885 else
2886 sayNO;
b8c5462f 2887 break;
d6a28714 2888 case ALNUML:
b8c5462f
JH
2889 PL_reg_flags |= RF_tainted;
2890 /* FALL THROUGH */
d6a28714 2891 case ALNUM:
b8c5462f 2892 if (!nextchr)
4633a7c4 2893 sayNO;
ffc61ed2 2894 if (do_utf8) {
ad24be35 2895 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2896 if (!(OP(scan) == ALNUM
3568d838 2897 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2898 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2899 {
2900 sayNO;
a0ed51b3 2901 }
b8c5462f 2902 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2903 nextchr = UCHARAT(locinput);
2904 break;
2905 }
ffc61ed2 2906 if (!(OP(scan) == ALNUM
d6a28714 2907 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2908 sayNO;
b8c5462f 2909 nextchr = UCHARAT(++locinput);
a0d0e21e 2910 break;
d6a28714 2911 case NALNUML:
b8c5462f
JH
2912 PL_reg_flags |= RF_tainted;
2913 /* FALL THROUGH */
d6a28714
JH
2914 case NALNUM:
2915 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2916 sayNO;
ffc61ed2 2917 if (do_utf8) {
8269fa76 2918 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2919 if (OP(scan) == NALNUM
3568d838 2920 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2921 : isALNUM_LC_utf8((U8*)locinput))
2922 {
b8c5462f 2923 sayNO;
d6a28714 2924 }
b8c5462f
JH
2925 locinput += PL_utf8skip[nextchr];
2926 nextchr = UCHARAT(locinput);
2927 break;
2928 }
ffc61ed2 2929 if (OP(scan) == NALNUM
d6a28714 2930 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2931 sayNO;
76e3520e 2932 nextchr = UCHARAT(++locinput);
a0d0e21e 2933 break;
d6a28714
JH
2934 case BOUNDL:
2935 case NBOUNDL:
3280af22 2936 PL_reg_flags |= RF_tainted;
bbce6d69 2937 /* FALL THROUGH */
d6a28714
JH
2938 case BOUND:
2939 case NBOUND:
2940 /* was last char in word? */
ffc61ed2 2941 if (do_utf8) {
12d33761
HS
2942 if (locinput == PL_bostr)
2943 ln = '\n';
ffc61ed2 2944 else {
b4f7163a 2945 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2946
b4f7163a 2947 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2948 }
2949 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2950 ln = isALNUM_uni(ln);
8269fa76 2951 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2952 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2953 }
2954 else {
9041c2e3 2955 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2956 n = isALNUM_LC_utf8((U8*)locinput);
2957 }
a0ed51b3 2958 }
d6a28714 2959 else {
12d33761
HS
2960 ln = (locinput != PL_bostr) ?
2961 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2962 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2963 ln = isALNUM(ln);
2964 n = isALNUM(nextchr);
2965 }
2966 else {
2967 ln = isALNUM_LC(ln);
2968 n = isALNUM_LC(nextchr);
2969 }
d6a28714 2970 }
ffc61ed2
JH
2971 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2972 OP(scan) == BOUNDL))
2973 sayNO;
a0ed51b3 2974 break;
d6a28714 2975 case SPACEL:
3280af22 2976 PL_reg_flags |= RF_tainted;
bbce6d69 2977 /* FALL THROUGH */
d6a28714 2978 case SPACE:
9442cb0e 2979 if (!nextchr)
4633a7c4 2980 sayNO;
1aa99e6b 2981 if (do_utf8) {
fd400ab9 2982 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2983 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2984 if (!(OP(scan) == SPACE
3568d838 2985 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2986 : isSPACE_LC_utf8((U8*)locinput)))
2987 {
2988 sayNO;
2989 }
2990 locinput += PL_utf8skip[nextchr];
2991 nextchr = UCHARAT(locinput);
2992 break;
d6a28714 2993 }
ffc61ed2
JH
2994 if (!(OP(scan) == SPACE
2995 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2996 sayNO;
2997 nextchr = UCHARAT(++locinput);
2998 }
2999 else {
3000 if (!(OP(scan) == SPACE
3001 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
3002 sayNO;
3003 nextchr = UCHARAT(++locinput);
a0ed51b3 3004 }
a0ed51b3 3005 break;
d6a28714 3006 case NSPACEL:
3280af22 3007 PL_reg_flags |= RF_tainted;
bbce6d69 3008 /* FALL THROUGH */
d6a28714 3009 case NSPACE:
9442cb0e 3010 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3011 sayNO;
1aa99e6b 3012 if (do_utf8) {
8269fa76 3013 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 3014 if (OP(scan) == NSPACE
3568d838 3015 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3016 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3017 {
3018 sayNO;
3019 }
3020 locinput += PL_utf8skip[nextchr];
3021 nextchr = UCHARAT(locinput);
3022 break;
a0ed51b3 3023 }
ffc61ed2 3024 if (OP(scan) == NSPACE
d6a28714 3025 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3026 sayNO;
76e3520e 3027 nextchr = UCHARAT(++locinput);
a0d0e21e 3028 break;
d6a28714 3029 case DIGITL:
a0ed51b3
LW
3030 PL_reg_flags |= RF_tainted;
3031 /* FALL THROUGH */
d6a28714 3032 case DIGIT:
9442cb0e 3033 if (!nextchr)
a0ed51b3 3034 sayNO;
1aa99e6b 3035 if (do_utf8) {
8269fa76 3036 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 3037 if (!(OP(scan) == DIGIT
3568d838 3038 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3039 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3040 {
a0ed51b3 3041 sayNO;
dfe13c55 3042 }
6f06b55f 3043 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3044 nextchr = UCHARAT(locinput);
3045 break;
3046 }
ffc61ed2 3047 if (!(OP(scan) == DIGIT
9442cb0e 3048 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3049 sayNO;
3050 nextchr = UCHARAT(++locinput);
3051 break;
d6a28714 3052 case NDIGITL:
b8c5462f
JH
3053 PL_reg_flags |= RF_tainted;
3054 /* FALL THROUGH */
d6a28714 3055 case NDIGIT:
9442cb0e 3056 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3057 sayNO;
1aa99e6b 3058 if (do_utf8) {
8269fa76 3059 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 3060 if (OP(scan) == NDIGIT
3568d838 3061 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3062 : isDIGIT_LC_utf8((U8*)locinput))
3063 {
a0ed51b3 3064 sayNO;
9442cb0e 3065 }
6f06b55f 3066 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3067 nextchr = UCHARAT(locinput);
3068 break;
3069 }
ffc61ed2 3070 if (OP(scan) == NDIGIT
9442cb0e 3071 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3072 sayNO;
3073 nextchr = UCHARAT(++locinput);
3074 break;
3075 case CLUMP:
b7c83a7e 3076 if (locinput >= PL_regeol)
a0ed51b3 3077 sayNO;
b7c83a7e
JH
3078 if (do_utf8) {
3079 LOAD_UTF8_CHARCLASS(mark,"~");
3080 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3081 sayNO;
3082 locinput += PL_utf8skip[nextchr];
3083 while (locinput < PL_regeol &&
3084 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3085 locinput += UTF8SKIP(locinput);
3086 if (locinput > PL_regeol)
3087 sayNO;
eb08e2da
JH
3088 }
3089 else
3090 locinput++;
a0ed51b3
LW
3091 nextchr = UCHARAT(locinput);
3092 break;
c8756f30 3093 case REFFL:
3280af22 3094 PL_reg_flags |= RF_tainted;
c8756f30 3095 /* FALL THROUGH */
c277df42 3096 case REF:
c8756f30 3097 case REFF:
c277df42 3098 n = ARG(scan); /* which paren pair */
cf93c79d 3099 ln = PL_regstartp[n];
2c2d71f5 3100 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 3101 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 3102 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 3103 if (ln == PL_regendp[n])
a0d0e21e 3104 break;
a0ed51b3 3105
cf93c79d 3106 s = PL_bostr + ln;
1aa99e6b 3107 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3108 char *l = locinput;
cf93c79d 3109 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3110 /*
3111 * Note that we can't do the "other character" lookup trick as
3112 * in the 8-bit case (no pun intended) because in Unicode we
3113 * have to map both upper and title case to lower case.
3114 */
3115 if (OP(scan) == REFF) {
a2a2844f 3116 STRLEN ulen1, ulen2;
89ebb4a3
JH
3117 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3118 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
a0ed51b3
LW
3119 while (s < e) {
3120 if (l >= PL_regeol)
3121 sayNO;
a2a2844f
JH
3122 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3123 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3124 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3125 sayNO;
a2a2844f
JH
3126 s += ulen1;
3127 l += ulen2;
a0ed51b3
LW
3128 }
3129 }
3130 locinput = l;
3131 nextchr = UCHARAT(locinput);
3132 break;
3133 }
3134
a0d0e21e 3135 /* Inline the first character, for speed. */
76e3520e 3136 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3137 (OP(scan) == REF ||
3138 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3139 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3140 sayNO;
cf93c79d 3141 ln = PL_regendp[n] - ln;
3280af22 3142 if (locinput + ln > PL_regeol)
4633a7c4 3143 sayNO;
c8756f30
AK
3144 if (ln > 1 && (OP(scan) == REF
3145 ? memNE(s, locinput, ln)
3146 : (OP(scan) == REFF
3147 ? ibcmp(s, locinput, ln)
3148 : ibcmp_locale(s, locinput, ln))))
4633a7c4 3149 sayNO;
a0d0e21e 3150 locinput += ln;
76e3520e 3151 nextchr = UCHARAT(locinput);
a0d0e21e
LW
3152 break;
3153
3154 case NOTHING:
c277df42 3155 case TAIL:
a0d0e21e
LW
3156 break;
3157 case BACK:
3158 break;
c277df42
IZ
3159 case EVAL:
3160 {
3161 dSP;
533c011a 3162 OP_4tree *oop = PL_op;
3280af22 3163 COP *ocurcop = PL_curcop;
f3548bdc 3164 PAD *old_comppad;
c277df42 3165 SV *ret;
080c2dec 3166 struct regexp *oreg = PL_reg_re;
9041c2e3 3167
c277df42 3168 n = ARG(scan);
533c011a 3169 PL_op = (OP_4tree*)PL_regdata->data[n];
a3621e74 3170 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 3171 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 3172 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 3173
8e5e9ebe
RGS
3174 {
3175 SV **before = SP;
3176 CALLRUNOPS(aTHX); /* Scalar context. */
3177 SPAGAIN;
3178 if (SP == before)
075aa684 3179 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3180 else {
3181 ret = POPs;
3182 PUTBACK;
3183 }
3184 }
3185
0f5d15d6 3186 PL_op = oop;
f3548bdc 3187 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 3188 PL_curcop = ocurcop;
c277df42 3189 if (logical) {
0f5d15d6
IZ
3190 if (logical == 2) { /* Postponed subexpression. */
3191 regexp *re;
22c35a8c 3192 MAGIC *mg = Null(MAGIC*);
0f5d15d6 3193 re_cc_state state;
0f5d15d6 3194 CHECKPOINT cp, lastcp;
cb50f42d 3195 int toggleutf;
faf82a0b 3196 register SV *sv;
0f5d15d6 3197
faf82a0b
AE
3198 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3199 mg = mg_find(sv, PERL_MAGIC_qr);
3200 else if (SvSMAGICAL(ret)) {
3201 if (SvGMAGICAL(ret))
3202 sv_unmagic(ret, PERL_MAGIC_qr);
3203 else
3204 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3205 }
faf82a0b 3206
0f5d15d6
IZ
3207 if (mg) {
3208 re = (regexp *)mg->mg_obj;
df0003d4 3209 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3210 }
3211 else {
3212 STRLEN len;
3213 char *t = SvPV(ret, len);
3214 PMOP pm;
3215 char *oprecomp = PL_regprecomp;
3216 I32 osize = PL_regsize;
3217 I32 onpar = PL_regnpar;
3218
5fcd1c1b 3219 Zero(&pm, 1, PMOP);
cb50f42d 3220 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 3221 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 3222 if (!(SvFLAGS(ret)
faf82a0b
AE
3223 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3224 | SVs_GMG)))
14befaf4
DM
3225 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3226 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
3227 PL_regprecomp = oprecomp;
3228 PL_regsize = osize;
3229 PL_regnpar = onpar;
3230 }
a3621e74 3231 DEBUG_EXECUTE_r(
9041c2e3 3232 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
3233 "Entering embedded `%s%.60s%s%s'\n",
3234 PL_colors[0],
3235 re->precomp,
3236 PL_colors[1],
3237 (strlen(re->precomp) > 60 ? "..." : ""))
3238 );
3239 state.node = next;
3240 state.prev = PL_reg_call_cc;
3241 state.cc = PL_regcc;
3242 state.re = PL_reg_re;
3243
2ab05381 3244 PL_regcc = 0;
9041c2e3 3245
0f5d15d6 3246 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3247 REGCP_SET(lastcp);
0f5d15d6
IZ
3248 cache_re(re);
3249 state.ss = PL_savestack_ix;
3250 *PL_reglastparen = 0;
a01268b5 3251 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
3252 PL_reg_call_cc = &state;
3253 PL_reginput = locinput;
cb50f42d
YST
3254 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
3255 ((re->reganch & ROPT_UTF8) != 0);
3256 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3257
3258 /* XXXX This is too dramatic a measure... */
3259 PL_reg_maxiter = 0;
3260
0f5d15d6 3261 if (regmatch(re->program + 1)) {
2c914db6
IZ
3262 /* Even though we succeeded, we need to restore
3263 global variables, since we may be wrapped inside
3264 SUSPEND, thus the match may be not finished yet. */
3265
3266 /* XXXX Do this only if SUSPENDed? */
3267 PL_reg_call_cc = state.prev;
3268 PL_regcc = state.cc;
3269 PL_reg_re = state.re;
3270 cache_re(PL_reg_re);
cb50f42d 3271 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
3272
3273 /* XXXX This is too dramatic a measure... */
3274 PL_reg_maxiter = 0;
3275
3276 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
3277 ReREFCNT_dec(re);
3278 regcpblow(cp);
3279 sayYES;
3280 }
0f5d15d6 3281 ReREFCNT_dec(re);
02db2b7b 3282 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3283 regcppop();
3284 PL_reg_call_cc = state.prev;
3285 PL_regcc = state.cc;
3286 PL_reg_re = state.re;
d3790889 3287 cache_re(PL_reg_re);
cb50f42d 3288 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
3289
3290 /* XXXX This is too dramatic a measure... */
3291 PL_reg_maxiter = 0;
3292
8e514ae6 3293 logical = 0;
0f5d15d6
IZ
3294 sayNO;
3295 }
c277df42 3296 sw = SvTRUE(ret);
0f5d15d6 3297 logical = 0;
a0ed51b3 3298 }
080c2dec 3299 else {
3280af22 3300 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
3301 cache_re(oreg);
3302 }
c277df42
IZ
3303 break;
3304 }
a0d0e21e 3305 case OPEN:
c277df42 3306 n = ARG(scan); /* which paren pair */
3280af22
NIS
3307 PL_reg_start_tmp[n] = locinput;
3308 if (n > PL_regsize)
3309 PL_regsize = n;
a0d0e21e
LW
3310 break;
3311 case CLOSE:
c277df42 3312 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3313 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3314 PL_regendp[n] = locinput - PL_bostr;
eb160463 3315 if (n > (I32)*PL_reglastparen)
3280af22 3316 *PL_reglastparen = n;
a01268b5 3317 *PL_reglastcloseparen = n;
a0d0e21e 3318 break;
c277df42
IZ
3319 case GROUPP:
3320 n = ARG(scan); /* which paren pair */
eb160463 3321 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3322 break;
3323 case IFTHEN:
2c2d71f5 3324 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3325 if (sw)
3326 next = NEXTOPER(NEXTOPER(scan));
3327 else {
3328 next = scan + ARG(scan);
3329 if (OP(next) == IFTHEN) /* Fake one. */
3330 next = NEXTOPER(NEXTOPER(next));
3331 }
3332 break;
3333 case LOGICAL:
0f5d15d6 3334 logical = scan->flags;
c277df42 3335 break;
2ab05381
IZ
3336/*******************************************************************
3337 PL_regcc contains infoblock about the innermost (...)* loop, and
3338 a pointer to the next outer infoblock.
3339
3340 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3341
3342 1) After matching X, regnode for CURLYX is processed;
3343
9041c2e3 3344 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3345 regmatch() recursively with the starting point at WHILEM node;
3346
3347 3) Each hit of WHILEM node tries to match A and Z (in the order
3348 depending on the current iteration, min/max of {min,max} and
3349 greediness). The information about where are nodes for "A"
3350 and "Z" is read from the infoblock, as is info on how many times "A"
3351 was already matched, and greediness.
3352
3353 4) After A matches, the same WHILEM node is hit again.
3354
3355 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3356 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3357 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3358 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3359 of the external loop.
3360
3361 Currently present infoblocks form a tree with a stem formed by PL_curcc
3362 and whatever it mentions via ->next, and additional attached trees
3363 corresponding to temporarily unset infoblocks as in "5" above.
3364
9041c2e3 3365 In the following picture infoblocks for outer loop of
2ab05381
IZ
3366 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3367 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3368 infoblocks are drawn below the "reset" infoblock.
3369
3370 In fact in the picture below we do not show failed matches for Z and T
3371 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3372 more obvious *why* one needs to *temporary* unset infoblocks.]
3373
3374 Matched REx position InfoBlocks Comment
3375 (Y(A)*?Z)*?T x
3376 Y(A)*?Z)*?T x <- O
3377 Y (A)*?Z)*?T x <- O
3378 Y A)*?Z)*?T x <- O <- I
3379 YA )*?Z)*?T x <- O <- I
3380 YA A)*?Z)*?T x <- O <- I
3381 YAA )*?Z)*?T x <- O <- I
3382 YAA Z)*?T x <- O # Temporary unset I
3383 I
3384
3385 YAAZ Y(A)*?Z)*?T x <- O
3386 I
3387
3388 YAAZY (A)*?Z)*?T x <- O
3389 I
3390
3391 YAAZY A)*?Z)*?T x <- O <- I
3392 I
3393
3394 YAAZYA )*?Z)*?T x <- O <- I
3395 I
3396
3397 YAAZYA Z)*?T x <- O # Temporary unset I
3398 I,I
3399
3400 YAAZYAZ )*?T x <- O
3401 I,I
3402
3403 YAAZYAZ T x # Temporary unset O
3404 O
3405 I,I
3406
3407 YAAZYAZT x
3408 O
3409 I,I
3410 *******************************************************************/
a0d0e21e
LW
3411 case CURLYX: {
3412 CURCUR cc;
3280af22 3413 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3414 /* No need to save/restore up to this paren */
3415 I32 parenfloor = scan->flags;
c277df42
IZ
3416
3417 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3418 next += ARG(next);
3280af22
NIS
3419 cc.oldcc = PL_regcc;
3420 PL_regcc = &cc;
cb434fcc
IZ
3421 /* XXXX Probably it is better to teach regpush to support
3422 parenfloor > PL_regsize... */
eb160463 3423 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3424 parenfloor = *PL_reglastparen; /* Pessimization... */
3425 cc.parenfloor = parenfloor;
a0d0e21e
LW
3426 cc.cur = -1;
3427 cc.min = ARG1(scan);
3428 cc.max = ARG2(scan);
c277df42 3429 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3430 cc.next = next;
3431 cc.minmod = minmod;
3432 cc.lastloc = 0;
3280af22 3433 PL_reginput = locinput;
a0d0e21e
LW
3434 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3435 regcpblow(cp);
3280af22 3436 PL_regcc = cc.oldcc;
4633a7c4 3437 saySAME(n);
a0d0e21e
LW
3438 }
3439 /* NOT REACHED */
3440 case WHILEM: {
3441 /*
3442 * This is really hard to understand, because after we match
3443 * what we're trying to match, we must make sure the rest of
2c2d71f5 3444 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3445 * to go back UP the parse tree by recursing ever deeper. And
3446 * if it fails, we have to reset our parent's current state
3447 * that we can try again after backing off.
3448 */
3449
c277df42 3450 CHECKPOINT cp, lastcp;
3280af22 3451 CURCUR* cc = PL_regcc;
c277df42
IZ
3452 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3453
4633a7c4 3454 n = cc->cur + 1; /* how many we know we matched */
3280af22 3455 PL_reginput = locinput;
a0d0e21e 3456
a3621e74 3457 DEBUG_EXECUTE_r(
9041c2e3 3458 PerlIO_printf(Perl_debug_log,
91f3b821 3459 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3460 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3461 (long)n, (long)cc->min,
2797576d 3462 (long)cc->max, PTR2UV(cc))
c277df42 3463 );
4633a7c4 3464
a0d0e21e
LW
3465 /* If degenerate scan matches "", assume scan done. */
3466
579cf2c3 3467 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3468 PL_regcc = cc->oldcc;
2ab05381
IZ
3469 if (PL_regcc)
3470 ln = PL_regcc->cur;
a3621e74 3471 DEBUG_EXECUTE_r(
c3464db5
DD
3472 PerlIO_printf(Perl_debug_log,
3473 "%*s empty match detected, try continuation...\n",
3280af22 3474 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3475 );
a0d0e21e 3476 if (regmatch(cc->next))
4633a7c4 3477 sayYES;
2ab05381
IZ
3478 if (PL_regcc)
3479 PL_regcc->cur = ln;
3280af22 3480 PL_regcc = cc;
4633a7c4 3481 sayNO;
a0d0e21e
LW
3482 }
3483
3484 /* First just match a string of min scans. */
3485
3486 if (n < cc->min) {
3487 cc->cur = n;
3488 cc->lastloc = locinput;
4633a7c4
LW
3489 if (regmatch(cc->scan))
3490 sayYES;
3491 cc->cur = n - 1;
c277df42 3492 cc->lastloc = lastloc;
4633a7c4 3493 sayNO;
a0d0e21e
LW
3494 }
3495
2c2d71f5
JH
3496 if (scan->flags) {
3497 /* Check whether we already were at this position.
3498 Postpone detection until we know the match is not
3499 *that* much linear. */
3500 if (!PL_reg_maxiter) {
3501 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3502 PL_reg_leftiter = PL_reg_maxiter;
3503 }
3504 if (PL_reg_leftiter-- == 0) {
69a95ace 3505 I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 3506 if (PL_reg_poscache) {
eb160463 3507 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3508 Renew(PL_reg_poscache, size, char);
3509 PL_reg_poscache_size = size;
3510 }
3511 Zero(PL_reg_poscache, size, char);
3512 }
3513 else {
3514 PL_reg_poscache_size = size;
3515 Newz(29, PL_reg_poscache, size, char);
3516 }
a3621e74 3517 DEBUG_EXECUTE_r(
2c2d71f5
JH
3518 PerlIO_printf(Perl_debug_log,
3519 "%sDetected a super-linear match, switching on caching%s...\n",
3520 PL_colors[4], PL_colors[5])
3521 );
3522 }
3523 if (PL_reg_leftiter < 0) {
3524 I32 o = locinput - PL_bostr, b;
3525
69a95ace 3526 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
2c2d71f5
JH
3527 b = o % 8;
3528 o /= 8;
3529 if (PL_reg_poscache[o] & (1<<b)) {
a3621e74 3530 DEBUG_EXECUTE_r(
2c2d71f5
JH
3531 PerlIO_printf(Perl_debug_log,
3532 "%*s already tried at this position...\n",
3533 REPORT_CODE_OFF+PL_regindent*2, "")
3534 );
69a95ace 3535 if (PL_reg_flags & RF_false)
c2b0868c
HS
3536 sayYES;
3537 else
3538 sayNO_SILENT;
2c2d71f5
JH
3539 }
3540 PL_reg_poscache[o] |= (1<<b);
3541 }
3542 }
3543
a0d0e21e
LW
3544 /* Prefer next over scan for minimal matching. */
3545
3546 if (cc->minmod) {
3280af22 3547 PL_regcc = cc->oldcc;
2ab05381
IZ
3548 if (PL_regcc)
3549 ln = PL_regcc->cur;
5f05dabc 3550 cp = regcppush(cc->parenfloor);
02db2b7b 3551 REGCP_SET(lastcp);
5f05dabc