This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Revert change #22472, but keep the new tests.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
4bb101f2
JH
70 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
71 **** 2000, 2001, 2002, 2003, by Larry Wall and others
a687059c 72 ****
9ef589d8
LW
73 **** You may distribute under the terms of either the GNU General Public
74 **** License or the Artistic License, as specified in the README file.
a687059c
LW
75 *
76 * Beware that some of this code is subtly aware of the way operator
77 * precedence is structured in regular expressions. Serious changes in
78 * regular-expression syntax might require a total rethink.
79 */
80#include "EXTERN.h"
864dbfa3 81#define PERL_IN_REGEXEC_C
a687059c 82#include "perl.h"
0f5d15d6 83
a687059c
LW
84#include "regcomp.h"
85
c277df42
IZ
86#define RF_tainted 1 /* tainted information used? */
87#define RF_warned 2 /* warned about big count? */
ce862d02 88#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3 89#define RF_utf8 8 /* String contains multibyte chars? */
c2b0868c 90#define RF_false 16 /* odd number of nested negatives */
a0ed51b3 91
eb160463 92#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
93
94#define RS_init 1 /* eval environment created */
95#define RS_set 2 /* replsv value is set */
c277df42 96
a687059c
LW
97#ifndef STATIC
98#define STATIC static
99#endif
100
ba7b4546 101#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
33b8afdf 107#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
dfe13c55
GS
110#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
111#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
112#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
113#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
114#define HOPc(pos,off) ((char*)HOP(pos,off))
115#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 116
efb30f32 117#define HOPBACK(pos, off) ( \
e54858b0 118 (PL_reg_match_utf8) \
efb30f32
HS
119 ? reghopmaybe((U8*)pos, -off) \
120 : (pos - off >= PL_bostr) \
121 ? (U8*)(pos - off) \
122 : (U8*)NULL \
123)
124#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
125
1aa99e6b
IH
126#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
127#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
128#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
129#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
130#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
131#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
132
977e4b84 133#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 134
5f80c4cf 135/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
136#define JUMPABLE(rn) ( \
137 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
138 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
139 OP(rn) == PLUS || OP(rn) == MINMOD || \
140 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
141)
142
cca55fe3
JP
143#define HAS_TEXT(rn) ( \
144 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
145)
e2d8ce26 146
a84d97b6
HS
147/*
148 Search for mandatory following text node; for lookahead, the text must
149 follow but for lookbehind (rn->flags != 0) we skip to the next step.
150*/
cca55fe3 151#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 152 while (JUMPABLE(rn)) \
a84d97b6 153 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 154 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
155 else if (OP(rn) == PLUS) \
156 rn = NEXTOPER(rn); \
a84d97b6
HS
157 else if (OP(rn) == IFMATCH) \
158 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 159 else rn += NEXT_OFF(rn); \
5f80c4cf 160} STMT_END
74750237 161
acfe0abc 162static void restore_pos(pTHX_ void *arg);
51371543 163
76e3520e 164STATIC CHECKPOINT
cea2e8a9 165S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 166{
3280af22 167 int retval = PL_savestack_ix;
b1ce53c5
JH
168#define REGCP_PAREN_ELEMS 4
169 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
170 int p;
171
e49a9654
IH
172 if (paren_elems_to_push < 0)
173 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
174
a01268b5 175#define REGCP_OTHER_ELEMS 6
4b3c1a47 176 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 177 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 178/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
179 SSPUSHINT(PL_regendp[p]);
180 SSPUSHINT(PL_regstartp[p]);
3280af22 181 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
182 SSPUSHINT(p);
183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
a01268b5 187 SSPUSHINT(*PL_reglastcloseparen);
3280af22 188 SSPUSHPTR(PL_reginput);
41123dfd
JH
189#define REGCP_FRAME_ELEMS 2
190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 194
a0d0e21e
LW
195 return retval;
196}
197
c277df42 198/* These are needed since we do not localize EVAL nodes: */
02db2b7b 199# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 202
02db2b7b 203# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 204 PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 207
76e3520e 208STATIC char *
cea2e8a9 209S_regcppop(pTHX)
a0d0e21e 210{
b1ce53c5 211 I32 i;
a0d0e21e
LW
212 U32 paren = 0;
213 char *input;
cf93c79d 214 I32 tmps;
b1ce53c5
JH
215
216 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 217 i = SSPOPINT;
b1ce53c5
JH
218 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
219 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 220 input = (char *) SSPOPPTR;
a01268b5 221 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
222 *PL_reglastparen = SSPOPINT;
223 PL_regsize = SSPOPINT;
b1ce53c5
JH
224
225 /* Now restore the parentheses context. */
41123dfd
JH
226 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
227 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 228 paren = (U32)SSPOPINT;
3280af22 229 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
230 PL_regstartp[paren] = SSPOPINT;
231 tmps = SSPOPINT;
3280af22
NIS
232 if (paren <= *PL_reglastparen)
233 PL_regendp[paren] = tmps;
c277df42 234 DEBUG_r(
c3464db5 235 PerlIO_printf(Perl_debug_log,
b900a521 236 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 237 (UV)paren, (IV)PL_regstartp[paren],
b900a521 238 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 239 (IV)PL_regendp[paren],
3280af22 240 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 241 );
a0d0e21e 242 }
c277df42 243 DEBUG_r(
eb160463 244 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 245 PerlIO_printf(Perl_debug_log,
faccc32b
JH
246 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
247 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
248 }
249 );
daf18116 250#if 1
dafc8851
JH
251 /* It would seem that the similar code in regtry()
252 * already takes care of this, and in fact it is in
253 * a better location to since this code can #if 0-ed out
254 * but the code in regtry() is needed or otherwise tests
255 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
256 * (as of patchlevel 7877) will fail. Then again,
257 * this code seems to be necessary or otherwise
258 * building DynaLoader will fail:
259 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
260 * --jhi */
eb160463
GS
261 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
262 if ((I32)paren > PL_regsize)
cf93c79d
IZ
263 PL_regstartp[paren] = -1;
264 PL_regendp[paren] = -1;
a0d0e21e 265 }
dafc8851 266#endif
a0d0e21e
LW
267 return input;
268}
269
0f5d15d6 270STATIC char *
cea2e8a9 271S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
272{
273 I32 tmp = PL_savestack_ix;
274
275 PL_savestack_ix = ss;
276 regcppop();
277 PL_savestack_ix = tmp;
942e002e 278 return Nullch;
0f5d15d6
IZ
279}
280
281typedef struct re_cc_state
282{
283 I32 ss;
284 regnode *node;
285 struct re_cc_state *prev;
286 CURCUR *cc;
287 regexp *re;
288} re_cc_state;
289
02db2b7b 290#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 291
29d1e993
HS
292#define TRYPAREN(paren, n, input) { \
293 if (paren) { \
294 if (n) { \
295 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
296 PL_regendp[paren] = input - PL_bostr; \
297 } \
298 else \
299 PL_regendp[paren] = -1; \
300 } \
301 if (regmatch(next)) \
302 sayYES; \
303 if (paren && n) \
304 PL_regendp[paren] = -1; \
305}
306
307
a687059c 308/*
e50aee73 309 * pregexec and friends
a687059c
LW
310 */
311
312/*
c277df42 313 - pregexec - match a regexp against a string
a687059c 314 */
c277df42 315I32
864dbfa3 316Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 317 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
318/* strend: pointer to null at end of string */
319/* strbeg: real beginning of string */
320/* minend: end of match must be >=minend after stringarg. */
321/* nosave: For optimizations. */
322{
323 return
9041c2e3 324 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
325 nosave ? 0 : REXEC_COPY_STR);
326}
0f5d15d6
IZ
327
328STATIC void
cea2e8a9 329S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
330{
331 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
332#ifdef DEBUGGING
333 PL_regprogram = prog->program;
334#endif
335 PL_regnpar = prog->nparens;
9041c2e3
NIS
336 PL_regdata = prog->data;
337 PL_reg_re = prog;
0f5d15d6 338}
22e551b9 339
9041c2e3 340/*
cad2e5aa
JH
341 * Need to implement the following flags for reg_anch:
342 *
343 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
344 * USE_INTUIT_ML
345 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
346 * INTUIT_AUTORITATIVE_ML
347 * INTUIT_ONCE_NOML - Intuit can match in one location only.
348 * INTUIT_ONCE_ML
349 *
350 * Another flag for this function: SECOND_TIME (so that float substrs
351 * with giant delta may be not rechecked).
352 */
353
354/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
355
2c2d71f5 356/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
357 Otherwise, only SvCUR(sv) is used to get strbeg. */
358
359/* XXXX We assume that strpos is strbeg unless sv. */
360
6eb5f6b9
JH
361/* XXXX Some places assume that there is a fixed substring.
362 An update may be needed if optimizer marks as "INTUITable"
363 RExen without fixed substrings. Similarly, it is assumed that
364 lengths of all the strings are no more than minlen, thus they
365 cannot come from lookahead.
366 (Or minlen should take into account lookahead.) */
367
2c2d71f5
JH
368/* A failure to find a constant substring means that there is no need to make
369 an expensive call to REx engine, thus we celebrate a failure. Similarly,
370 finding a substring too deep into the string means that less calls to
30944b6d
IZ
371 regtry() should be needed.
372
373 REx compiler's optimizer found 4 possible hints:
374 a) Anchored substring;
375 b) Fixed substring;
376 c) Whether we are anchored (beginning-of-line or \G);
377 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 378 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
379 string which does not contradict any of them.
380 */
2c2d71f5 381
6eb5f6b9
JH
382/* Most of decisions we do here should have been done at compile time.
383 The nodes of the REx which we used for the search should have been
384 deleted from the finite automaton. */
385
cad2e5aa
JH
386char *
387Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
388 char *strend, U32 flags, re_scream_pos_data *data)
389{
b7953727 390 register I32 start_shift = 0;
cad2e5aa 391 /* Should be nonnegative! */
b7953727 392 register I32 end_shift = 0;
2c2d71f5
JH
393 register char *s;
394 register SV *check;
a1933d95 395 char *strbeg;
cad2e5aa 396 char *t;
33b8afdf 397 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 398 I32 ml_anch;
6eb5f6b9 399 register char *other_last = Nullch; /* other substr checked before this */
b7953727 400 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
401#ifdef DEBUGGING
402 char *i_strpos = strpos;
ce333219 403 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 404#endif
a30b2f1f 405 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 406
b8d68ded
JH
407 if (prog->reganch & ROPT_UTF8) {
408 DEBUG_r(PerlIO_printf(Perl_debug_log,
409 "UTF-8 regex...\n"));
410 PL_reg_flags |= RF_utf8;
411 }
412
2a782b5b 413 DEBUG_r({
b8d68ded 414 char *s = PL_reg_match_utf8 ?
c728cb41
JH
415 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
416 strpos;
b8d68ded
JH
417 int len = PL_reg_match_utf8 ?
418 strlen(s) : strend - strpos;
2a782b5b
JH
419 if (!PL_colorset)
420 reginitcolors();
b8d68ded
JH
421 if (PL_reg_match_utf8)
422 DEBUG_r(PerlIO_printf(Perl_debug_log,
423 "UTF-8 target...\n"));
2a782b5b
JH
424 PerlIO_printf(Perl_debug_log,
425 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
426 PL_colors[4],PL_colors[5],PL_colors[0],
427 prog->precomp,
428 PL_colors[1],
429 (strlen(prog->precomp) > 60 ? "..." : ""),
430 PL_colors[0],
431 (int)(len > 60 ? 60 : len),
432 s, PL_colors[1],
433 (len > 60 ? "..." : "")
434 );
435 });
cad2e5aa 436
c344f387
JH
437 /* CHR_DIST() would be more correct here but it makes things slow. */
438 if (prog->minlen > strend - strpos) {
a72c7584
JH
439 DEBUG_r(PerlIO_printf(Perl_debug_log,
440 "String too short... [re_intuit_start]\n"));
cad2e5aa 441 goto fail;
2c2d71f5 442 }
a1933d95 443 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 444 PL_regeol = strend;
33b8afdf
JH
445 if (do_utf8) {
446 if (!prog->check_utf8 && prog->check_substr)
447 to_utf8_substr(prog);
448 check = prog->check_utf8;
449 } else {
450 if (!prog->check_substr && prog->check_utf8)
451 to_byte_substr(prog);
452 check = prog->check_substr;
453 }
454 if (check == &PL_sv_undef) {
455 DEBUG_r(PerlIO_printf(Perl_debug_log,
456 "Non-utf string cannot match utf check string\n"));
457 goto fail;
458 }
2c2d71f5 459 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
460 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
461 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 462 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 463
7e25d62c
JH
464 if (!ml_anch) {
465 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
466 | ROPT_IMPLICIT)) /* not a real BOL */
467 /* SvCUR is not set on references: SvRV and SvPVX overlap */
468 && sv && !SvROK(sv)
469 && (strpos != strbeg)) {
470 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
471 goto fail;
472 }
473 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 474 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 475 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
476 I32 slen;
477
1aa99e6b 478 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
479 if (SvTAIL(check)) {
480 slen = SvCUR(check); /* >= 1 */
cad2e5aa 481
9041c2e3 482 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
483 || (strend - s == slen && strend[-1] != '\n')) {
484 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
485 goto fail_finish;
cad2e5aa
JH
486 }
487 /* Now should match s[0..slen-2] */
488 slen--;
653099ff 489 if (slen && (*SvPVX(check) != *s
cad2e5aa 490 || (slen > 1
653099ff 491 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
492 report_neq:
493 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
494 goto fail_finish;
495 }
cad2e5aa 496 }
653099ff
GS
497 else if (*SvPVX(check) != *s
498 || ((slen = SvCUR(check)) > 1
499 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
500 goto report_neq;
501 goto success_at_start;
7e25d62c 502 }
cad2e5aa 503 }
2c2d71f5 504 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 505 s = strpos;
2c2d71f5 506 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 507 end_shift = prog->minlen - start_shift -
653099ff 508 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 509 if (!ml_anch) {
653099ff
GS
510 I32 end = prog->check_offset_max + CHR_SVLEN(check)
511 - (SvTAIL(check) != 0);
1aa99e6b 512 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
513
514 if (end_shift < eshift)
515 end_shift = eshift;
516 }
cad2e5aa 517 }
2c2d71f5 518 else { /* Can match at random position */
cad2e5aa
JH
519 ml_anch = 0;
520 s = strpos;
2c2d71f5
JH
521 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
522 /* Should be nonnegative! */
523 end_shift = prog->minlen - start_shift -
653099ff 524 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
525 }
526
2c2d71f5 527#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 528 if (end_shift < 0)
6bbae5e6 529 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
530#endif
531
2c2d71f5
JH
532 restart:
533 /* Find a possible match in the region s..strend by looking for
534 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 535 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
536 I32 p = -1; /* Internal iterator of scream. */
537 I32 *pp = data ? data->scream_pos : &p;
538
2c2d71f5
JH
539 if (PL_screamfirst[BmRARE(check)] >= 0
540 || ( BmRARE(check) == '\n'
541 && (BmPREVIOUS(check) == SvCUR(check) - 1)
542 && SvTAIL(check) ))
9041c2e3 543 s = screaminstr(sv, check,
2c2d71f5 544 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 545 else
2c2d71f5 546 goto fail_finish;
4addbd3b
HS
547 /* we may be pointing at the wrong string */
548 if (s && RX_MATCH_COPIED(prog))
7ef91622 549 s = strbeg + (s - SvPVX(sv));
cad2e5aa
JH
550 if (data)
551 *data->scream_olds = s;
552 }
f33976b4 553 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
554 s = fbm_instr((U8*)(s + start_shift),
555 (U8*)(strend - end_shift),
556 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 557 else
1aa99e6b
IH
558 s = fbm_instr(HOP3(s, start_shift, strend),
559 HOP3(strend, -end_shift, strbeg),
2c2d71f5 560 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
561
562 /* Update the count-of-usability, remove useless subpatterns,
563 unshift s. */
2c2d71f5
JH
564
565 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
566 (s ? "Found" : "Did not find"),
33b8afdf 567 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 568 PL_colors[0],
7b0972df
JH
569 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
570 SvPVX(check),
2c2d71f5
JH
571 PL_colors[1], (SvTAIL(check) ? "$" : ""),
572 (s ? " at offset " : "...\n") ) );
573
574 if (!s)
575 goto fail_finish;
576
6eb5f6b9
JH
577 check_at = s;
578
2c2d71f5 579 /* Finish the diagnostic message */
30944b6d 580 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
581
582 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
583 Start with the other substr.
584 XXXX no SCREAM optimization yet - and a very coarse implementation
585 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
586 *always* match. Probably should be marked during compile...
587 Probably it is right to do no SCREAM here...
588 */
589
33b8afdf 590 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 591 /* Take into account the "other" substring. */
2c2d71f5
JH
592 /* XXXX May be hopelessly wrong for UTF... */
593 if (!other_last)
6eb5f6b9 594 other_last = strpos;
33b8afdf 595 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
596 do_other_anchored:
597 {
1aa99e6b 598 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 599 char *s1 = s;
33b8afdf 600 SV* must;
2c2d71f5 601
2c2d71f5
JH
602 t = s - prog->check_offset_max;
603 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 604 && (!do_utf8
1aa99e6b 605 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 606 && t > strpos)))
30944b6d 607 /* EMPTY */;
2c2d71f5
JH
608 else
609 t = strpos;
1aa99e6b 610 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
611 if (t < other_last) /* These positions already checked */
612 t = other_last;
1aa99e6b 613 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
614 if (last < last1)
615 last1 = last;
616 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
617 /* On end-of-str: see comment below. */
33b8afdf
JH
618 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
619 if (must == &PL_sv_undef) {
620 s = (char*)NULL;
621 DEBUG_r(must = prog->anchored_utf8); /* for debug */
622 }
623 else
624 s = fbm_instr(
625 (unsigned char*)t,
626 HOP3(HOP3(last1, prog->anchored_offset, strend)
627 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
628 must,
629 PL_multiline ? FBMrf_MULTILINE : 0
630 );
1aa99e6b
IH
631 DEBUG_r(PerlIO_printf(Perl_debug_log,
632 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
633 (s ? "Found" : "Contradicts"),
634 PL_colors[0],
33b8afdf
JH
635 (int)(SvCUR(must)
636 - (SvTAIL(must)!=0)),
637 SvPVX(must),
638 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
639 if (!s) {
640 if (last1 >= last2) {
641 DEBUG_r(PerlIO_printf(Perl_debug_log,
642 ", giving up...\n"));
643 goto fail_finish;
644 }
645 DEBUG_r(PerlIO_printf(Perl_debug_log,
646 ", trying floating at offset %ld...\n",
1aa99e6b
IH
647 (long)(HOP3c(s1, 1, strend) - i_strpos)));
648 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
649 s = HOP3c(last, 1, strend);
2c2d71f5
JH
650 goto restart;
651 }
652 else {
653 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 654 (long)(s - i_strpos)));
1aa99e6b
IH
655 t = HOP3c(s, -prog->anchored_offset, strbeg);
656 other_last = HOP3c(s, 1, strend);
30944b6d 657 s = s1;
2c2d71f5
JH
658 if (t == strpos)
659 goto try_at_start;
2c2d71f5
JH
660 goto try_at_offset;
661 }
30944b6d 662 }
2c2d71f5
JH
663 }
664 else { /* Take into account the floating substring. */
33b8afdf
JH
665 char *last, *last1;
666 char *s1 = s;
667 SV* must;
668
669 t = HOP3c(s, -start_shift, strbeg);
670 last1 = last =
671 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
672 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
673 last = HOP3c(t, prog->float_max_offset, strend);
674 s = HOP3c(t, prog->float_min_offset, strend);
675 if (s < other_last)
676 s = other_last;
2c2d71f5 677 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
678 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
679 /* fbm_instr() takes into account exact value of end-of-str
680 if the check is SvTAIL(ed). Since false positives are OK,
681 and end-of-str is not later than strend we are OK. */
682 if (must == &PL_sv_undef) {
683 s = (char*)NULL;
684 DEBUG_r(must = prog->float_utf8); /* for debug message */
685 }
686 else
2c2d71f5 687 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
688 (unsigned char*)last + SvCUR(must)
689 - (SvTAIL(must)!=0),
690 must, PL_multiline ? FBMrf_MULTILINE : 0);
691 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
692 (s ? "Found" : "Contradicts"),
693 PL_colors[0],
694 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
695 SvPVX(must),
696 PL_colors[1], (SvTAIL(must) ? "$" : "")));
697 if (!s) {
698 if (last1 == last) {
2c2d71f5 699 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
700 ", giving up...\n"));
701 goto fail_finish;
2c2d71f5 702 }
33b8afdf
JH
703 DEBUG_r(PerlIO_printf(Perl_debug_log,
704 ", trying anchored starting at offset %ld...\n",
705 (long)(s1 + 1 - i_strpos)));
706 other_last = last;
707 s = HOP3c(t, 1, strend);
708 goto restart;
709 }
710 else {
711 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
712 (long)(s - i_strpos)));
713 other_last = s; /* Fix this later. --Hugo */
714 s = s1;
715 if (t == strpos)
716 goto try_at_start;
717 goto try_at_offset;
718 }
2c2d71f5 719 }
cad2e5aa 720 }
2c2d71f5
JH
721
722 t = s - prog->check_offset_max;
2c2d71f5 723 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 724 && (!do_utf8
1aa99e6b
IH
725 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
726 && t > strpos))) {
2c2d71f5
JH
727 /* Fixed substring is found far enough so that the match
728 cannot start at strpos. */
729 try_at_offset:
cad2e5aa 730 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
731 /* Eventually fbm_*() should handle this, but often
732 anchored_offset is not 0, so this check will not be wasted. */
733 /* XXXX In the code below we prefer to look for "^" even in
734 presence of anchored substrings. And we search even
735 beyond the found float position. These pessimizations
736 are historical artefacts only. */
737 find_anchor:
2c2d71f5 738 while (t < strend - prog->minlen) {
cad2e5aa 739 if (*t == '\n') {
4ee3650e 740 if (t < check_at - prog->check_offset_min) {
33b8afdf 741 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
742 /* Since we moved from the found position,
743 we definitely contradict the found anchored
30944b6d
IZ
744 substr. Due to the above check we do not
745 contradict "check" substr.
746 Thus we can arrive here only if check substr
747 is float. Redo checking for "other"=="fixed".
748 */
9041c2e3 749 strpos = t + 1;
30944b6d
IZ
750 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
751 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
752 goto do_other_anchored;
753 }
4ee3650e
GS
754 /* We don't contradict the found floating substring. */
755 /* XXXX Why not check for STCLASS? */
cad2e5aa 756 s = t + 1;
2c2d71f5 757 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 758 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
759 goto set_useful;
760 }
4ee3650e
GS
761 /* Position contradicts check-string */
762 /* XXXX probably better to look for check-string
763 than for "\n", so one should lower the limit for t? */
764 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 765 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 766 other_last = strpos = s = t + 1;
cad2e5aa
JH
767 goto restart;
768 }
769 t++;
770 }
2c2d71f5
JH
771 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
772 PL_colors[0],PL_colors[1]));
773 goto fail_finish;
cad2e5aa 774 }
f5952150
GS
775 else {
776 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
777 PL_colors[0],PL_colors[1]));
778 }
cad2e5aa
JH
779 s = t;
780 set_useful:
33b8afdf 781 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
782 }
783 else {
f5952150 784 /* The found string does not prohibit matching at strpos,
2c2d71f5 785 - no optimization of calling REx engine can be performed,
f5952150
GS
786 unless it was an MBOL and we are not after MBOL,
787 or a future STCLASS check will fail this. */
2c2d71f5
JH
788 try_at_start:
789 /* Even in this situation we may use MBOL flag if strpos is offset
790 wrt the start of the string. */
05b4157f 791 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 792 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
793 /* May be due to an implicit anchor of m{.*foo} */
794 && !(prog->reganch & ROPT_IMPLICIT))
795 {
cad2e5aa
JH
796 t = strpos;
797 goto find_anchor;
798 }
30944b6d 799 DEBUG_r( if (ml_anch)
f5952150
GS
800 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
801 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 802 );
2c2d71f5 803 success_at_start:
30944b6d 804 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
805 && (do_utf8 ? (
806 prog->check_utf8 /* Could be deleted already */
807 && --BmUSEFUL(prog->check_utf8) < 0
808 && (prog->check_utf8 == prog->float_utf8)
809 ) : (
810 prog->check_substr /* Could be deleted already */
811 && --BmUSEFUL(prog->check_substr) < 0
812 && (prog->check_substr == prog->float_substr)
813 )))
66e933ab 814 {
cad2e5aa 815 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 816 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
817 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
818 if (do_utf8 ? prog->check_substr : prog->check_utf8)
819 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
820 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
821 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 822 check = Nullsv; /* abort */
cad2e5aa 823 s = strpos;
3cf5c195
IZ
824 /* XXXX This is a remnant of the old implementation. It
825 looks wasteful, since now INTUIT can use many
6eb5f6b9 826 other heuristics. */
cad2e5aa
JH
827 prog->reganch &= ~RE_USE_INTUIT;
828 }
829 else
830 s = strpos;
831 }
832
6eb5f6b9
JH
833 /* Last resort... */
834 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
835 if (prog->regstclass) {
836 /* minlen == 0 is possible if regstclass is \b or \B,
837 and the fixed substr is ''$.
838 Since minlen is already taken into account, s+1 is before strend;
839 accidentally, minlen >= 1 guaranties no false positives at s + 1
840 even for \b or \B. But (minlen? 1 : 0) below assumes that
841 regstclass does not come from lookahead... */
842 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
843 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 844 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 845 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 846 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 847 : 1);
33b8afdf 848 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 849 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 850 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
851 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
852 cl_l, strend)
853 : strend);
a1933d95 854 char *startpos = strbeg;
6eb5f6b9
JH
855
856 t = s;
9926ca43 857 cache_re(prog);
f33976b4 858 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
859 if (!s) {
860#ifdef DEBUGGING
b7953727 861 char *what = 0;
6eb5f6b9
JH
862#endif
863 if (endpos == strend) {
864 DEBUG_r( PerlIO_printf(Perl_debug_log,
865 "Could not match STCLASS...\n") );
866 goto fail;
867 }
66e933ab
GS
868 DEBUG_r( PerlIO_printf(Perl_debug_log,
869 "This position contradicts STCLASS...\n") );
653099ff
GS
870 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
871 goto fail;
6eb5f6b9 872 /* Contradict one of substrings */
33b8afdf
JH
873 if (prog->anchored_substr || prog->anchored_utf8) {
874 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
875 DEBUG_r( what = "anchored" );
876 hop_and_restart:
1aa99e6b 877 s = HOP3c(t, 1, strend);
66e933ab
GS
878 if (s + start_shift + end_shift > strend) {
879 /* XXXX Should be taken into account earlier? */
880 DEBUG_r( PerlIO_printf(Perl_debug_log,
881 "Could not match STCLASS...\n") );
882 goto fail;
883 }
5e39e1e5
HS
884 if (!check)
885 goto giveup;
6eb5f6b9 886 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 887 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
888 what, (long)(s + start_shift - i_strpos)) );
889 goto restart;
890 }
66e933ab 891 /* Have both, check_string is floating */
6eb5f6b9
JH
892 if (t + start_shift >= check_at) /* Contradicts floating=check */
893 goto retry_floating_check;
894 /* Recheck anchored substring, but not floating... */
9041c2e3 895 s = check_at;
5e39e1e5
HS
896 if (!check)
897 goto giveup;
6eb5f6b9 898 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 899 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
900 (long)(other_last - i_strpos)) );
901 goto do_other_anchored;
902 }
60e71179
GS
903 /* Another way we could have checked stclass at the
904 current position only: */
905 if (ml_anch) {
906 s = t = t + 1;
5e39e1e5
HS
907 if (!check)
908 goto giveup;
60e71179 909 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
910 "Looking for /%s^%s/m starting at offset %ld...\n",
911 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 912 goto try_at_offset;
66e933ab 913 }
33b8afdf 914 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 915 goto fail;
6eb5f6b9
JH
916 /* Check is floating subtring. */
917 retry_floating_check:
918 t = check_at - start_shift;
919 DEBUG_r( what = "floating" );
920 goto hop_and_restart;
921 }
b7953727
JH
922 if (t != s) {
923 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 924 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
925 (long)(t - i_strpos), (long)(s - i_strpos))
926 );
927 }
928 else {
929 DEBUG_r(PerlIO_printf(Perl_debug_log,
930 "Does not contradict STCLASS...\n");
931 );
932 }
6eb5f6b9 933 }
5e39e1e5
HS
934 giveup:
935 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
936 PL_colors[4], (check ? "Guessed" : "Giving up"),
937 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 938 return s;
2c2d71f5
JH
939
940 fail_finish: /* Substring not found */
33b8afdf
JH
941 if (prog->check_substr || prog->check_utf8) /* could be removed already */
942 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 943 fail:
2c2d71f5 944 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
945 PL_colors[4],PL_colors[5]));
946 return Nullch;
947}
9661b544 948
6eb5f6b9 949/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
950STATIC char *
951S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 952{
6eb5f6b9
JH
953 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
954 char *m;
d8093b23 955 STRLEN ln;
5dab1207 956 STRLEN lnc;
d8093b23
G
957 unsigned int c1;
958 unsigned int c2;
6eb5f6b9
JH
959 char *e;
960 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 961 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 962
6eb5f6b9
JH
963 /* We know what class it must start with. */
964 switch (OP(c)) {
6eb5f6b9 965 case ANYOF:
388cc4de
HS
966 if (do_utf8) {
967 while (s < strend) {
968 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
969 !UTF8_IS_INVARIANT((U8)s[0]) ?
970 reginclass(c, (U8*)s, 0, do_utf8) :
971 REGINCLASS(c, (U8*)s)) {
972 if (tmp && (norun || regtry(prog, s)))
973 goto got_it;
974 else
975 tmp = doevery;
976 }
977 else
978 tmp = 1;
979 s += UTF8SKIP(s);
980 }
981 }
982 else {
983 while (s < strend) {
984 STRLEN skip = 1;
985
986 if (REGINCLASS(c, (U8*)s) ||
987 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
988 /* The assignment of 2 is intentional:
989 * for the folded sharp s, the skip is 2. */
990 (skip = SHARP_S_SKIP))) {
991 if (tmp && (norun || regtry(prog, s)))
992 goto got_it;
993 else
994 tmp = doevery;
995 }
996 else
997 tmp = 1;
998 s += skip;
999 }
a0d0e21e 1000 }
6eb5f6b9 1001 break;
f33976b4
DB
1002 case CANY:
1003 while (s < strend) {
1004 if (tmp && (norun || regtry(prog, s)))
1005 goto got_it;
1006 else
1007 tmp = doevery;
1008 s++;
1009 }
1010 break;
6eb5f6b9 1011 case EXACTF:
5dab1207
NIS
1012 m = STRING(c);
1013 ln = STR_LEN(c); /* length to match in octets/bytes */
1014 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1015 if (UTF) {
a2a2844f 1016 STRLEN ulen1, ulen2;
5dab1207 1017 U8 *sm = (U8 *) m;
e7ae6809
JH
1018 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1019 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
1020
1021 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1022 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1023
872c91ae
JH
1024 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1025 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1026 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1027 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
5dab1207
NIS
1028 lnc = 0;
1029 while (sm < ((U8 *) m + ln)) {
1030 lnc++;
1031 sm += UTF8SKIP(sm);
1032 }
1aa99e6b
IH
1033 }
1034 else {
1035 c1 = *(U8*)m;
1036 c2 = PL_fold[c1];
1037 }
6eb5f6b9
JH
1038 goto do_exactf;
1039 case EXACTFL:
5dab1207
NIS
1040 m = STRING(c);
1041 ln = STR_LEN(c);
1042 lnc = (I32) ln;
d8093b23 1043 c1 = *(U8*)m;
6eb5f6b9
JH
1044 c2 = PL_fold_locale[c1];
1045 do_exactf:
db12adc6 1046 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1047
6eb5f6b9
JH
1048 if (norun && e < s)
1049 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1050
60a8b682
JH
1051 /* The idea in the EXACTF* cases is to first find the
1052 * first character of the EXACTF* node and then, if
1053 * necessary, case-insensitively compare the full
1054 * text of the node. The c1 and c2 are the first
1055 * characters (though in Unicode it gets a bit
1056 * more complicated because there are more cases
7f16dd3d
JH
1057 * than just upper and lower: one needs to use
1058 * the so-called folding case for case-insensitive
1059 * matching (called "loose matching" in Unicode).
1060 * ibcmp_utf8() will do just that. */
60a8b682 1061
1aa99e6b 1062 if (do_utf8) {
575cac57
JH
1063 UV c, f;
1064 U8 tmpbuf [UTF8_MAXLEN+1];
1065 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1066 STRLEN len, foldlen;
d7f013c8 1067
09091399 1068 if (c1 == c2) {
5dab1207
NIS
1069 /* Upper and lower of 1st char are equal -
1070 * probably not a "letter". */
1aa99e6b 1071 while (s <= e) {
872c91ae
JH
1072 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1073 ckWARN(WARN_UTF8) ?
1074 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1075 if ( c == c1
1076 && (ln == len ||
66423254 1077 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1078 m, (char **)0, ln, (bool)UTF))
55da9344 1079 && (norun || regtry(prog, s)) )
1aa99e6b 1080 goto got_it;
80aecb99
JH
1081 else {
1082 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1083 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1084 if ( f != c
1085 && (f == c1 || f == c2)
1086 && (ln == foldlen ||
66423254
JH
1087 !ibcmp_utf8((char *) foldbuf,
1088 (char **)0, foldlen, do_utf8,
d07ddd77 1089 m,
eb160463 1090 (char **)0, ln, (bool)UTF))
80aecb99
JH
1091 && (norun || regtry(prog, s)) )
1092 goto got_it;
1093 }
1aa99e6b
IH
1094 s += len;
1095 }
09091399
JH
1096 }
1097 else {
1aa99e6b 1098 while (s <= e) {
872c91ae
JH
1099 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1100 ckWARN(WARN_UTF8) ?
1101 0 : UTF8_ALLOW_ANY);
80aecb99 1102
60a8b682 1103 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1104 * Note that not all the possible combinations
1105 * are handled here: some of them are handled
1106 * by the standard folding rules, and some of
1107 * them (the character class or ANYOF cases)
1108 * are handled during compiletime in
1109 * regexec.c:S_regclass(). */
880bd946
JH
1110 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1111 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1112 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1113
1114 if ( (c == c1 || c == c2)
1115 && (ln == len ||
66423254 1116 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1117 m, (char **)0, ln, (bool)UTF))
55da9344 1118 && (norun || regtry(prog, s)) )
1aa99e6b 1119 goto got_it;
80aecb99
JH
1120 else {
1121 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1122 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1123 if ( f != c
1124 && (f == c1 || f == c2)
1125 && (ln == foldlen ||
a6872d42 1126 !ibcmp_utf8((char *) foldbuf,
66423254 1127 (char **)0, foldlen, do_utf8,
d07ddd77 1128 m,
eb160463 1129 (char **)0, ln, (bool)UTF))
80aecb99
JH
1130 && (norun || regtry(prog, s)) )
1131 goto got_it;
1132 }
1aa99e6b
IH
1133 s += len;
1134 }
09091399 1135 }
1aa99e6b
IH
1136 }
1137 else {
1138 if (c1 == c2)
1139 while (s <= e) {
1140 if ( *(U8*)s == c1
1141 && (ln == 1 || !(OP(c) == EXACTF
1142 ? ibcmp(s, m, ln)
1143 : ibcmp_locale(s, m, ln)))
1144 && (norun || regtry(prog, s)) )
1145 goto got_it;
1146 s++;
1147 }
1148 else
1149 while (s <= e) {
1150 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1151 && (ln == 1 || !(OP(c) == EXACTF
1152 ? ibcmp(s, m, ln)
1153 : ibcmp_locale(s, m, ln)))
1154 && (norun || regtry(prog, s)) )
1155 goto got_it;
1156 s++;
1157 }
b3c9acc1
IZ
1158 }
1159 break;
bbce6d69 1160 case BOUNDL:
3280af22 1161 PL_reg_flags |= RF_tainted;
bbce6d69 1162 /* FALL THROUGH */
a0d0e21e 1163 case BOUND:
ffc61ed2 1164 if (do_utf8) {
12d33761 1165 if (s == PL_bostr)
ffc61ed2
JH
1166 tmp = '\n';
1167 else {
b4f7163a 1168 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1169
b4f7163a 1170 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1171 }
1172 tmp = ((OP(c) == BOUND ?
9041c2e3 1173 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1174 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1175 while (s < strend) {
1176 if (tmp == !(OP(c) == BOUND ?
3568d838 1177 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1178 isALNUM_LC_utf8((U8*)s)))
1179 {
1180 tmp = !tmp;
1181 if ((norun || regtry(prog, s)))
1182 goto got_it;
1183 }
1184 s += UTF8SKIP(s);
a687059c 1185 }
a0d0e21e 1186 }
667bb95a 1187 else {
12d33761 1188 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1189 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1190 while (s < strend) {
1191 if (tmp ==
1192 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1193 tmp = !tmp;
1194 if ((norun || regtry(prog, s)))
1195 goto got_it;
1196 }
1197 s++;
a0ed51b3 1198 }
a0ed51b3 1199 }
6eb5f6b9 1200 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1201 goto got_it;
1202 break;
bbce6d69 1203 case NBOUNDL:
3280af22 1204 PL_reg_flags |= RF_tainted;
bbce6d69 1205 /* FALL THROUGH */
a0d0e21e 1206 case NBOUND:
ffc61ed2 1207 if (do_utf8) {
12d33761 1208 if (s == PL_bostr)
ffc61ed2
JH
1209 tmp = '\n';
1210 else {
b4f7163a 1211 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1212
b4f7163a 1213 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1214 }
1215 tmp = ((OP(c) == NBOUND ?
9041c2e3 1216 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1217 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1218 while (s < strend) {
1219 if (tmp == !(OP(c) == NBOUND ?
3568d838 1220 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1221 isALNUM_LC_utf8((U8*)s)))
1222 tmp = !tmp;
1223 else if ((norun || regtry(prog, s)))
1224 goto got_it;
1225 s += UTF8SKIP(s);
1226 }
a0d0e21e 1227 }
667bb95a 1228 else {
12d33761 1229 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1230 tmp = ((OP(c) == NBOUND ?
1231 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1232 while (s < strend) {
1233 if (tmp ==
1234 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1235 tmp = !tmp;
1236 else if ((norun || regtry(prog, s)))
1237 goto got_it;
1238 s++;
1239 }
a0ed51b3 1240 }
6eb5f6b9 1241 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1242 goto got_it;
1243 break;
a0d0e21e 1244 case ALNUM:
ffc61ed2 1245 if (do_utf8) {
8269fa76 1246 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1247 while (s < strend) {
3568d838 1248 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1249 if (tmp && (norun || regtry(prog, s)))
1250 goto got_it;
1251 else
1252 tmp = doevery;
1253 }
bbce6d69 1254 else
ffc61ed2
JH
1255 tmp = 1;
1256 s += UTF8SKIP(s);
bbce6d69 1257 }
bbce6d69 1258 }
ffc61ed2
JH
1259 else {
1260 while (s < strend) {
1261 if (isALNUM(*s)) {
1262 if (tmp && (norun || regtry(prog, s)))
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a0ed51b3 1267 else
ffc61ed2
JH
1268 tmp = 1;
1269 s++;
a0ed51b3 1270 }
a0ed51b3
LW
1271 }
1272 break;
bbce6d69 1273 case ALNUML:
3280af22 1274 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1275 if (do_utf8) {
1276 while (s < strend) {
1277 if (isALNUM_LC_utf8((U8*)s)) {
1278 if (tmp && (norun || regtry(prog, s)))
1279 goto got_it;
1280 else
1281 tmp = doevery;
1282 }
a687059c 1283 else
ffc61ed2
JH
1284 tmp = 1;
1285 s += UTF8SKIP(s);
a0d0e21e 1286 }
a0d0e21e 1287 }
ffc61ed2
JH
1288 else {
1289 while (s < strend) {
1290 if (isALNUM_LC(*s)) {
1291 if (tmp && (norun || regtry(prog, s)))
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
a0ed51b3 1296 else
ffc61ed2
JH
1297 tmp = 1;
1298 s++;
a0ed51b3 1299 }
a0ed51b3
LW
1300 }
1301 break;
a0d0e21e 1302 case NALNUM:
ffc61ed2 1303 if (do_utf8) {
8269fa76 1304 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1305 while (s < strend) {
3568d838 1306 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1307 if (tmp && (norun || regtry(prog, s)))
1308 goto got_it;
1309 else
1310 tmp = doevery;
1311 }
bbce6d69 1312 else
ffc61ed2
JH
1313 tmp = 1;
1314 s += UTF8SKIP(s);
bbce6d69 1315 }
bbce6d69 1316 }
ffc61ed2
JH
1317 else {
1318 while (s < strend) {
1319 if (!isALNUM(*s)) {
1320 if (tmp && (norun || regtry(prog, s)))
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a0ed51b3 1325 else
ffc61ed2
JH
1326 tmp = 1;
1327 s++;
a0ed51b3 1328 }
a0ed51b3
LW
1329 }
1330 break;
bbce6d69 1331 case NALNUML:
3280af22 1332 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1333 if (do_utf8) {
1334 while (s < strend) {
1335 if (!isALNUM_LC_utf8((U8*)s)) {
1336 if (tmp && (norun || regtry(prog, s)))
1337 goto got_it;
1338 else
1339 tmp = doevery;
1340 }
a687059c 1341 else
ffc61ed2
JH
1342 tmp = 1;
1343 s += UTF8SKIP(s);
a687059c 1344 }
a0d0e21e 1345 }
ffc61ed2
JH
1346 else {
1347 while (s < strend) {
1348 if (!isALNUM_LC(*s)) {
1349 if (tmp && (norun || regtry(prog, s)))
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0ed51b3 1354 else
ffc61ed2
JH
1355 tmp = 1;
1356 s++;
a0ed51b3 1357 }
a0ed51b3
LW
1358 }
1359 break;
a0d0e21e 1360 case SPACE:
ffc61ed2 1361 if (do_utf8) {
8269fa76 1362 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1363 while (s < strend) {
3568d838 1364 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1365 if (tmp && (norun || regtry(prog, s)))
1366 goto got_it;
1367 else
1368 tmp = doevery;
1369 }
a0d0e21e 1370 else
ffc61ed2
JH
1371 tmp = 1;
1372 s += UTF8SKIP(s);
2304df62 1373 }
a0d0e21e 1374 }
ffc61ed2
JH
1375 else {
1376 while (s < strend) {
1377 if (isSPACE(*s)) {
1378 if (tmp && (norun || regtry(prog, s)))
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
a0ed51b3 1383 else
ffc61ed2
JH
1384 tmp = 1;
1385 s++;
a0ed51b3 1386 }
a0ed51b3
LW
1387 }
1388 break;
bbce6d69 1389 case SPACEL:
3280af22 1390 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1391 if (do_utf8) {
1392 while (s < strend) {
1393 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1394 if (tmp && (norun || regtry(prog, s)))
1395 goto got_it;
1396 else
1397 tmp = doevery;
1398 }
bbce6d69 1399 else
ffc61ed2
JH
1400 tmp = 1;
1401 s += UTF8SKIP(s);
bbce6d69 1402 }
bbce6d69 1403 }
ffc61ed2
JH
1404 else {
1405 while (s < strend) {
1406 if (isSPACE_LC(*s)) {
1407 if (tmp && (norun || regtry(prog, s)))
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0ed51b3 1412 else
ffc61ed2
JH
1413 tmp = 1;
1414 s++;
a0ed51b3 1415 }
a0ed51b3
LW
1416 }
1417 break;
a0d0e21e 1418 case NSPACE:
ffc61ed2 1419 if (do_utf8) {
8269fa76 1420 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1421 while (s < strend) {
3568d838 1422 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1423 if (tmp && (norun || regtry(prog, s)))
1424 goto got_it;
1425 else
1426 tmp = doevery;
1427 }
a0d0e21e 1428 else
ffc61ed2
JH
1429 tmp = 1;
1430 s += UTF8SKIP(s);
a687059c 1431 }
a0d0e21e 1432 }
ffc61ed2
JH
1433 else {
1434 while (s < strend) {
1435 if (!isSPACE(*s)) {
1436 if (tmp && (norun || regtry(prog, s)))
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
a0ed51b3 1441 else
ffc61ed2
JH
1442 tmp = 1;
1443 s++;
a0ed51b3 1444 }
a0ed51b3
LW
1445 }
1446 break;
bbce6d69 1447 case NSPACEL:
3280af22 1448 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1449 if (do_utf8) {
1450 while (s < strend) {
1451 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1452 if (tmp && (norun || regtry(prog, s)))
1453 goto got_it;
1454 else
1455 tmp = doevery;
1456 }
bbce6d69 1457 else
ffc61ed2
JH
1458 tmp = 1;
1459 s += UTF8SKIP(s);
bbce6d69 1460 }
bbce6d69 1461 }
ffc61ed2
JH
1462 else {
1463 while (s < strend) {
1464 if (!isSPACE_LC(*s)) {
1465 if (tmp && (norun || regtry(prog, s)))
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0ed51b3 1470 else
ffc61ed2
JH
1471 tmp = 1;
1472 s++;
a0ed51b3 1473 }
a0ed51b3
LW
1474 }
1475 break;
a0d0e21e 1476 case DIGIT:
ffc61ed2 1477 if (do_utf8) {
8269fa76 1478 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1479 while (s < strend) {
3568d838 1480 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1481 if (tmp && (norun || regtry(prog, s)))
1482 goto got_it;
1483 else
1484 tmp = doevery;
1485 }
a0d0e21e 1486 else
ffc61ed2
JH
1487 tmp = 1;
1488 s += UTF8SKIP(s);
2b69d0c2 1489 }
a0d0e21e 1490 }
ffc61ed2
JH
1491 else {
1492 while (s < strend) {
1493 if (isDIGIT(*s)) {
1494 if (tmp && (norun || regtry(prog, s)))
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
a0ed51b3 1499 else
ffc61ed2
JH
1500 tmp = 1;
1501 s++;
a0ed51b3 1502 }
a0ed51b3
LW
1503 }
1504 break;
b8c5462f
JH
1505 case DIGITL:
1506 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1507 if (do_utf8) {
1508 while (s < strend) {
1509 if (isDIGIT_LC_utf8((U8*)s)) {
1510 if (tmp && (norun || regtry(prog, s)))
1511 goto got_it;
1512 else
1513 tmp = doevery;
1514 }
b8c5462f 1515 else
ffc61ed2
JH
1516 tmp = 1;
1517 s += UTF8SKIP(s);
b8c5462f 1518 }
b8c5462f 1519 }
ffc61ed2
JH
1520 else {
1521 while (s < strend) {
1522 if (isDIGIT_LC(*s)) {
1523 if (tmp && (norun || regtry(prog, s)))
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
b8c5462f 1528 else
ffc61ed2
JH
1529 tmp = 1;
1530 s++;
b8c5462f 1531 }
b8c5462f
JH
1532 }
1533 break;
a0d0e21e 1534 case NDIGIT:
ffc61ed2 1535 if (do_utf8) {
8269fa76 1536 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1537 while (s < strend) {
3568d838 1538 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1539 if (tmp && (norun || regtry(prog, s)))
1540 goto got_it;
1541 else
1542 tmp = doevery;
1543 }
a0d0e21e 1544 else
ffc61ed2
JH
1545 tmp = 1;
1546 s += UTF8SKIP(s);
a687059c 1547 }
a0d0e21e 1548 }
ffc61ed2
JH
1549 else {
1550 while (s < strend) {
1551 if (!isDIGIT(*s)) {
1552 if (tmp && (norun || regtry(prog, s)))
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
a0ed51b3 1557 else
ffc61ed2
JH
1558 tmp = 1;
1559 s++;
a0ed51b3 1560 }
a0ed51b3
LW
1561 }
1562 break;
b8c5462f
JH
1563 case NDIGITL:
1564 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1565 if (do_utf8) {
1566 while (s < strend) {
1567 if (!isDIGIT_LC_utf8((U8*)s)) {
1568 if (tmp && (norun || regtry(prog, s)))
1569 goto got_it;
1570 else
1571 tmp = doevery;
1572 }
b8c5462f 1573 else
ffc61ed2
JH
1574 tmp = 1;
1575 s += UTF8SKIP(s);
b8c5462f 1576 }
a0ed51b3 1577 }
ffc61ed2
JH
1578 else {
1579 while (s < strend) {
1580 if (!isDIGIT_LC(*s)) {
1581 if (tmp && (norun || regtry(prog, s)))
1582 goto got_it;
1583 else
1584 tmp = doevery;
1585 }
cf93c79d 1586 else
ffc61ed2
JH
1587 tmp = 1;
1588 s++;
b8c5462f 1589 }
b8c5462f
JH
1590 }
1591 break;
b3c9acc1 1592 default:
3c3eec57
GS
1593 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1594 break;
d6a28714 1595 }
6eb5f6b9
JH
1596 return 0;
1597 got_it:
1598 return s;
1599}
1600
1601/*
1602 - regexec_flags - match a regexp against a string
1603 */
1604I32
1605Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1606 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1607/* strend: pointer to null at end of string */
1608/* strbeg: real beginning of string */
1609/* minend: end of match must be >=minend after stringarg. */
1610/* data: May be used for some additional optimizations. */
1611/* nosave: For optimizations. */
1612{
6eb5f6b9
JH
1613 register char *s;
1614 register regnode *c;
1615 register char *startpos = stringarg;
6eb5f6b9
JH
1616 I32 minlen; /* must match at least this many chars */
1617 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1618 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1619 constant substr. */ /* CC */
1620 I32 end_shift = 0; /* Same for the end. */ /* CC */
1621 I32 scream_pos = -1; /* Internal iterator of scream. */
1622 char *scream_olds;
1623 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1624 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1625#ifdef DEBUGGING
9e55ce06
JH
1626 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1627 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1628#endif
a30b2f1f 1629 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1630
1631 PL_regcc = 0;
1632
1633 cache_re(prog);
1634#ifdef DEBUGGING
aea4f609 1635 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1636#endif
1637
1638 /* Be paranoid... */
1639 if (prog == NULL || startpos == NULL) {
1640 Perl_croak(aTHX_ "NULL regexp parameter");
1641 return 0;
1642 }
1643
1644 minlen = prog->minlen;
61a36c01 1645 if (strend - startpos < minlen) {
a72c7584
JH
1646 DEBUG_r(PerlIO_printf(Perl_debug_log,
1647 "String too short [regexec_flags]...\n"));
1648 goto phooey;
1aa99e6b 1649 }
6eb5f6b9 1650
6eb5f6b9
JH
1651 /* Check validity of program. */
1652 if (UCHARAT(prog->program) != REG_MAGIC) {
1653 Perl_croak(aTHX_ "corrupted regexp program");
1654 }
1655
1656 PL_reg_flags = 0;
1657 PL_reg_eval_set = 0;
1658 PL_reg_maxiter = 0;
1659
1660 if (prog->reganch & ROPT_UTF8)
1661 PL_reg_flags |= RF_utf8;
1662
1663 /* Mark beginning of line for ^ and lookbehind. */
1664 PL_regbol = startpos;
1665 PL_bostr = strbeg;
1666 PL_reg_sv = sv;
1667
1668 /* Mark end of line for $ (and such) */
1669 PL_regeol = strend;
1670
1671 /* see how far we have to get to not match where we matched before */
1672 PL_regtill = startpos+minend;
1673
1674 /* We start without call_cc context. */
1675 PL_reg_call_cc = 0;
1676
1677 /* If there is a "must appear" string, look for it. */
1678 s = startpos;
1679
1680 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1681 MAGIC *mg;
1682
1683 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1684 PL_reg_ganch = startpos;
1685 else if (sv && SvTYPE(sv) >= SVt_PVMG
1686 && SvMAGIC(sv)
14befaf4
DM
1687 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1688 && mg->mg_len >= 0) {
6eb5f6b9
JH
1689 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1690 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1691 if (s > PL_reg_ganch)
6eb5f6b9
JH
1692 goto phooey;
1693 s = PL_reg_ganch;
1694 }
1695 }
1696 else /* pos() not defined */
1697 PL_reg_ganch = strbeg;
1698 }
1699
33b8afdf 1700 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1701 re_scream_pos_data d;
1702
1703 d.scream_olds = &scream_olds;
1704 d.scream_pos = &scream_pos;
1705 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1706 if (!s) {
1707 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1708 goto phooey; /* not present */
3fa9c3d7 1709 }
6eb5f6b9
JH
1710 }
1711
2a782b5b 1712 DEBUG_r({
9e55ce06
JH
1713 char *s0 = UTF ?
1714 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1715 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1716 prog->precomp;
1717 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1718 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1719 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1720 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1721 if (!PL_colorset)
1722 reginitcolors();
1723 PerlIO_printf(Perl_debug_log,
9e55ce06 1724 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1725 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1726 len0, len0, s0,
2a782b5b 1727 PL_colors[1],
9e55ce06 1728 len0 > 60 ? "..." : "",
2a782b5b 1729 PL_colors[0],
9e55ce06
JH
1730 (int)(len1 > 60 ? 60 : len1),
1731 s1, PL_colors[1],
1732 (len1 > 60 ? "..." : "")
2a782b5b
JH
1733 );
1734 });
6eb5f6b9
JH
1735
1736 /* Simplest case: anchored match need be tried only once. */
1737 /* [unless only anchor is BOL and multiline is set] */
1738 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1739 if (s == startpos && regtry(prog, startpos))
1740 goto got_it;
1741 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1742 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1743 {
1744 char *end;
1745
1746 if (minlen)
1747 dontbother = minlen - 1;
1aa99e6b 1748 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1749 /* for multiline we only have to try after newlines */
33b8afdf 1750 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1751 if (s == startpos)
1752 goto after_try;
1753 while (1) {
1754 if (regtry(prog, s))
1755 goto got_it;
1756 after_try:
1757 if (s >= end)
1758 goto phooey;
1759 if (prog->reganch & RE_USE_INTUIT) {
1760 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1761 if (!s)
1762 goto phooey;
1763 }
1764 else
1765 s++;
1766 }
1767 } else {
1768 if (s > startpos)
1769 s--;
1770 while (s < end) {
1771 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1772 if (regtry(prog, s))
1773 goto got_it;
1774 }
1775 }
1776 }
1777 }
1778 goto phooey;
1779 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1780 if (regtry(prog, PL_reg_ganch))
1781 goto got_it;
1782 goto phooey;
1783 }
1784
1785 /* Messy cases: unanchored match. */
33b8afdf 1786 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1787 /* we have /x+whatever/ */
1788 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1789 char ch;
bf93d4cc
GS
1790#ifdef DEBUGGING
1791 int did_match = 0;
1792#endif
33b8afdf
JH
1793 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1794 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1795 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1796
1aa99e6b 1797 if (do_utf8) {
6eb5f6b9
JH
1798 while (s < strend) {
1799 if (*s == ch) {
bf93d4cc 1800 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1801 if (regtry(prog, s)) goto got_it;
1802 s += UTF8SKIP(s);
1803 while (s < strend && *s == ch)
1804 s += UTF8SKIP(s);
1805 }
1806 s += UTF8SKIP(s);
1807 }
1808 }
1809 else {
1810 while (s < strend) {
1811 if (*s == ch) {
bf93d4cc 1812 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1813 if (regtry(prog, s)) goto got_it;
1814 s++;
1815 while (s < strend && *s == ch)
1816 s++;
1817 }
1818 s++;
1819 }
1820 }
b7953727 1821 DEBUG_r(if (!did_match)
bf93d4cc 1822 PerlIO_printf(Perl_debug_log,
b7953727
JH
1823 "Did not find anchored character...\n")
1824 );
6eb5f6b9
JH
1825 }
1826 /*SUPPRESS 560*/
33b8afdf
JH
1827 else if (prog->anchored_substr != Nullsv
1828 || prog->anchored_utf8 != Nullsv
1829 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1830 && prog->float_max_offset < strend - s)) {
1831 SV *must;
1832 I32 back_max;
1833 I32 back_min;
1834 char *last;
6eb5f6b9 1835 char *last1; /* Last position checked before */
bf93d4cc
GS
1836#ifdef DEBUGGING
1837 int did_match = 0;
1838#endif
33b8afdf
JH
1839 if (prog->anchored_substr || prog->anchored_utf8) {
1840 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1841 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1842 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1843 back_max = back_min = prog->anchored_offset;
1844 } else {
1845 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1846 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1847 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1848 back_max = prog->float_max_offset;
1849 back_min = prog->float_min_offset;
1850 }
1851 if (must == &PL_sv_undef)
1852 /* could not downgrade utf8 check substring, so must fail */
1853 goto phooey;
1854
1855 last = HOP3c(strend, /* Cannot start after this */
1856 -(I32)(CHR_SVLEN(must)
1857 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1858
1859 if (s > PL_bostr)
1860 last1 = HOPc(s, -1);
1861 else
1862 last1 = s - 1; /* bogus */
1863
1864 /* XXXX check_substr already used to find `s', can optimize if
1865 check_substr==must. */
1866 scream_pos = -1;
1867 dontbother = end_shift;
1868 strend = HOPc(strend, -dontbother);
1869 while ( (s <= last) &&
9041c2e3 1870 ((flags & REXEC_SCREAM)
1aa99e6b 1871 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1872 end_shift, &scream_pos, 0))
1aa99e6b 1873 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1874 (unsigned char*)strend, must,
6eb5f6b9 1875 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1876 /* we may be pointing at the wrong string */
1877 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
7ef91622 1878 s = strbeg + (s - SvPVX(sv));
bf93d4cc 1879 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1880 if (HOPc(s, -back_max) > last1) {
1881 last1 = HOPc(s, -back_min);
1882 s = HOPc(s, -back_max);
1883 }
1884 else {
1885 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1886
1887 last1 = HOPc(s, -back_min);
1888 s = t;
1889 }
1aa99e6b 1890 if (do_utf8) {
6eb5f6b9
JH
1891 while (s <= last1) {
1892 if (regtry(prog, s))
1893 goto got_it;
1894 s += UTF8SKIP(s);
1895 }
1896 }
1897 else {
1898 while (s <= last1) {
1899 if (regtry(prog, s))
1900 goto got_it;
1901 s++;
1902 }
1903 }
1904 }
b7953727
JH
1905 DEBUG_r(if (!did_match)
1906 PerlIO_printf(Perl_debug_log,
1907 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1908 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1909 ? "anchored" : "floating"),
1910 PL_colors[0],
1911 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1912 SvPVX(must),
b7953727
JH
1913 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1914 );
6eb5f6b9
JH
1915 goto phooey;
1916 }
155aba94 1917 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1918 if (minlen) {
1919 I32 op = (U8)OP(prog->regstclass);
66e933ab 1920 /* don't bother with what can't match */
f14c76ed
RGS
1921 if (PL_regkind[op] != EXACT && op != CANY)
1922 strend = HOPc(strend, -(minlen - 1));
1923 }
ffc61ed2
JH
1924 DEBUG_r({
1925 SV *prop = sv_newmortal();
9e55ce06
JH
1926 char *s0;
1927 char *s1;
1928 int len0;
1929 int len1;
1930
ffc61ed2 1931 regprop(prop, c);
9e55ce06
JH
1932 s0 = UTF ?
1933 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1934 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1935 SvPVX(prop);
1936 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1937 s1 = UTF ?
c728cb41 1938 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1939 len1 = UTF ? SvCUR(dsv1) : strend - s;
1940 PerlIO_printf(Perl_debug_log,
1941 "Matching stclass `%*.*s' against `%*.*s'\n",
1942 len0, len0, s0,
1943 len1, len1, s1);
ffc61ed2 1944 });
6eb5f6b9
JH
1945 if (find_byclass(prog, c, s, strend, startpos, 0))
1946 goto got_it;
bf93d4cc 1947 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1948 }
1949 else {
1950 dontbother = 0;
33b8afdf
JH
1951 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1952 /* Trim the end. */
d6a28714 1953 char *last;
33b8afdf
JH
1954 SV* float_real;
1955
1956 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1957 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1958 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1959
1960 if (flags & REXEC_SCREAM) {
33b8afdf 1961 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1962 end_shift, &scream_pos, 1); /* last one */
1963 if (!last)
ffc61ed2 1964 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1965 /* we may be pointing at the wrong string */
1966 else if (RX_MATCH_COPIED(prog))
7ef91622 1967 s = strbeg + (s - SvPVX(sv));
b8c5462f 1968 }
d6a28714
JH
1969 else {
1970 STRLEN len;
33b8afdf 1971 char *little = SvPV(float_real, len);
d6a28714 1972
33b8afdf 1973 if (SvTAIL(float_real)) {
d6a28714
JH
1974 if (memEQ(strend - len + 1, little, len - 1))
1975 last = strend - len + 1;
1976 else if (!PL_multiline)
9041c2e3 1977 last = memEQ(strend - len, little, len)
d6a28714 1978 ? strend - len : Nullch;
b8c5462f 1979 else
d6a28714
JH
1980 goto find_last;
1981 } else {
1982 find_last:
9041c2e3 1983 if (len)
d6a28714 1984 last = rninstr(s, strend, little, little + len);
b8c5462f 1985 else
d6a28714 1986 last = strend; /* matching `$' */
b8c5462f 1987 }
b8c5462f 1988 }
bf93d4cc
GS
1989 if (last == NULL) {
1990 DEBUG_r(PerlIO_printf(Perl_debug_log,
1991 "%sCan't trim the tail, match fails (should not happen)%s\n",
1992 PL_colors[4],PL_colors[5]));
1993 goto phooey; /* Should not happen! */
1994 }
d6a28714
JH
1995 dontbother = strend - last + prog->float_min_offset;
1996 }
1997 if (minlen && (dontbother < minlen))
1998 dontbother = minlen - 1;
1999 strend -= dontbother; /* this one's always in bytes! */
2000 /* We don't know much -- general case. */
1aa99e6b 2001 if (do_utf8) {
d6a28714
JH
2002 for (;;) {
2003 if (regtry(prog, s))
2004 goto got_it;
2005 if (s >= strend)
2006 break;
b8c5462f 2007 s += UTF8SKIP(s);
d6a28714
JH
2008 };
2009 }
2010 else {
2011 do {
2012 if (regtry(prog, s))
2013 goto got_it;
2014 } while (s++ < strend);
2015 }
2016 }
2017
2018 /* Failure. */
2019 goto phooey;
2020
2021got_it:
2022 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2023
2024 if (PL_reg_eval_set) {
2025 /* Preserve the current value of $^R */
2026 if (oreplsv != GvSV(PL_replgv))
2027 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2028 restored, the value remains
2029 the same. */
acfe0abc 2030 restore_pos(aTHX_ 0);
d6a28714
JH
2031 }
2032
2033 /* make sure $`, $&, $', and $digit will work later */
2034 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2035 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2036 if (flags & REXEC_COPY_STR) {
2037 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2038#ifdef PERL_COPY_ON_WRITE
2039 if ((SvIsCOW(sv)
2040 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2041 if (DEBUG_C_TEST) {
2042 PerlIO_printf(Perl_debug_log,
2043 "Copy on write: regexp capture, type %d\n",
2044 (int) SvTYPE(sv));
2045 }
2046 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2047 prog->subbeg = SvPVX(prog->saved_copy);
2048 assert (SvPOKp(prog->saved_copy));
2049 } else
2050#endif
2051 {
2052 RX_MATCH_COPIED_on(prog);
2053 s = savepvn(strbeg, i);
2054 prog->subbeg = s;
2055 }
d6a28714 2056 prog->sublen = i;
d6a28714
JH
2057 }
2058 else {
2059 prog->subbeg = strbeg;
2060 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2061 }
2062 }
9041c2e3 2063
d6a28714
JH
2064 return 1;
2065
2066phooey:
bf93d4cc
GS
2067 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2068 PL_colors[4],PL_colors[5]));
d6a28714 2069 if (PL_reg_eval_set)
acfe0abc 2070 restore_pos(aTHX_ 0);
d6a28714
JH
2071 return 0;
2072}
2073
2074/*
2075 - regtry - try match at specific point
2076 */
2077STATIC I32 /* 0 failure, 1 success */
2078S_regtry(pTHX_ regexp *prog, char *startpos)
2079{
d6a28714
JH
2080 register I32 i;
2081 register I32 *sp;
2082 register I32 *ep;
2083 CHECKPOINT lastcp;
2084
02db2b7b
IZ
2085#ifdef DEBUGGING
2086 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2087#endif
d6a28714
JH
2088 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2089 MAGIC *mg;
2090
2091 PL_reg_eval_set = RS_init;
2092 DEBUG_r(DEBUG_s(
b900a521
JH
2093 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2094 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2095 ));
e8347627 2096 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2097 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2098 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2099 SAVETMPS;
2100 /* Apparently this is not needed, judging by wantarray. */
e8347627 2101 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2102 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2103
2104 if (PL_reg_sv) {
2105 /* Make $_ available to executed code. */
2106 if (PL_reg_sv != DEFSV) {
59f00321 2107 SAVE_DEFSV;
d6a28714 2108 DEFSV = PL_reg_sv;
b8c5462f 2109 }
d6a28714 2110
9041c2e3 2111 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2112 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2113 /* prepare for quick setting of pos */
14befaf4
DM
2114 sv_magic(PL_reg_sv, (SV*)0,
2115 PERL_MAGIC_regex_global, Nullch, 0);
2116 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2117 mg->mg_len = -1;
b8c5462f 2118 }
d6a28714
JH
2119 PL_reg_magic = mg;
2120 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2121 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2122 }
09687e5a 2123 if (!PL_reg_curpm) {
0f79a09d 2124 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2125#ifdef USE_ITHREADS
2126 {
2127 SV* repointer = newSViv(0);
577e12cc 2128 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2129 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2130 av_push(PL_regex_padav,repointer);
2131 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2132 PL_regex_pad = AvARRAY(PL_regex_padav);
2133 }
2134#endif
2135 }
aaa362c4 2136 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2137 PL_reg_oldcurpm = PL_curpm;
2138 PL_curpm = PL_reg_curpm;
2139 if (RX_MATCH_COPIED(prog)) {
2140 /* Here is a serious problem: we cannot rewrite subbeg,
2141 since it may be needed if this match fails. Thus
2142 $` inside (?{}) could fail... */
2143 PL_reg_oldsaved = prog->subbeg;
2144 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2145#ifdef PERL_COPY_ON_WRITE
2146 PL_nrs = prog->saved_copy;
2147#endif
d6a28714
JH
2148 RX_MATCH_COPIED_off(prog);
2149 }
2150 else
2151 PL_reg_oldsaved = Nullch;
2152 prog->subbeg = PL_bostr;
2153 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2154 }
2155 prog->startp[0] = startpos - PL_bostr;
2156 PL_reginput = startpos;
2157 PL_regstartp = prog->startp;
2158 PL_regendp = prog->endp;
2159 PL_reglastparen = &prog->lastparen;
a01268b5 2160 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2161 prog->lastparen = 0;
03994de8 2162 prog->lastcloseparen = 0;
d6a28714
JH
2163 PL_regsize = 0;
2164 DEBUG_r(PL_reg_starttry = startpos);
2165 if (PL_reg_start_tmpl <= prog->nparens) {
2166 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2167 if(PL_reg_start_tmp)
2168 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2169 else
2170 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2171 }
2172
2173 /* XXXX What this code is doing here?!!! There should be no need
2174 to do this again and again, PL_reglastparen should take care of
3dd2943c 2175 this! --ilya*/
dafc8851
JH
2176
2177 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2178 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2179 * PL_reglastparen), is not needed at all by the test suite
2180 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2181 * enough, for building DynaLoader, or otherwise this
2182 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2183 * will happen. Meanwhile, this code *is* needed for the
2184 * above-mentioned test suite tests to succeed. The common theme
2185 * on those tests seems to be returning null fields from matches.
2186 * --jhi */
dafc8851 2187#if 1
d6a28714
JH
2188 sp = prog->startp;
2189 ep = prog->endp;
2190 if (prog->nparens) {
eb160463 2191 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2192 *++sp = -1;
2193 *++ep = -1;
2194 }
2195 }
dafc8851 2196#endif
02db2b7b 2197 REGCP_SET(lastcp);
d6a28714
JH
2198 if (regmatch(prog->program + 1)) {
2199 prog->endp[0] = PL_reginput - PL_bostr;
2200 return 1;
2201 }
02db2b7b 2202 REGCP_UNWIND(lastcp);
d6a28714
JH
2203 return 0;
2204}
2205
02db2b7b
IZ
2206#define RE_UNWIND_BRANCH 1
2207#define RE_UNWIND_BRANCHJ 2
2208
2209union re_unwind_t;
2210
2211typedef struct { /* XX: makes sense to enlarge it... */
2212 I32 type;
2213 I32 prev;
2214 CHECKPOINT lastcp;
2215} re_unwind_generic_t;
2216
2217typedef struct {
2218 I32 type;
2219 I32 prev;
2220 CHECKPOINT lastcp;
2221 I32 lastparen;
2222 regnode *next;
2223 char *locinput;
2224 I32 nextchr;
2225#ifdef DEBUGGING
2226 int regindent;
2227#endif
2228} re_unwind_branch_t;
2229
2230typedef union re_unwind_t {
2231 I32 type;
2232 re_unwind_generic_t generic;
2233 re_unwind_branch_t branch;
2234} re_unwind_t;
2235
8ba1375e
MJD
2236#define sayYES goto yes
2237#define sayNO goto no
e0f9d4a8 2238#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2239#define sayYES_FINAL goto yes_final
2240#define sayYES_LOUD goto yes_loud
2241#define sayNO_FINAL goto no_final
2242#define sayNO_SILENT goto do_no
2243#define saySAME(x) if (x) goto yes; else goto no
2244
2245#define REPORT_CODE_OFF 24
2246
d6a28714
JH
2247/*
2248 - regmatch - main matching routine
2249 *
2250 * Conceptually the strategy is simple: check to see whether the current
2251 * node matches, call self recursively to see whether the rest matches,
2252 * and then act accordingly. In practice we make some effort to avoid
2253 * recursion, in particular by going through "ordinary" nodes (that don't
2254 * need to know whether the rest of the match failed) by a loop instead of
2255 * by recursion.
2256 */
2257/* [lwall] I've hoisted the register declarations to the outer block in order to
2258 * maybe save a little bit of pushing and popping on the stack. It also takes
2259 * advantage of machines that use a register save mask on subroutine entry.
2260 */
2261STATIC I32 /* 0 failure, 1 success */
2262S_regmatch(pTHX_ regnode *prog)
2263{
d6a28714
JH
2264 register regnode *scan; /* Current node. */
2265 regnode *next; /* Next node. */
2266 regnode *inner; /* Next node in internal branch. */
2267 register I32 nextchr; /* renamed nextchr - nextchar colides with
2268 function of same name */
2269 register I32 n; /* no or next */
b7953727
JH
2270 register I32 ln = 0; /* len or last */
2271 register char *s = Nullch; /* operand or save */
d6a28714 2272 register char *locinput = PL_reginput;
b7953727 2273 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2274 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2275 I32 unwind = 0;
b7953727 2276#if 0
02db2b7b 2277 I32 firstcp = PL_savestack_ix;
b7953727 2278#endif
53c4c00c 2279 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2280#ifdef DEBUGGING
ce333219
JH
2281 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2282 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2283 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2284#endif
02db2b7b 2285
d6a28714
JH
2286#ifdef DEBUGGING
2287 PL_regindent++;
2288#endif
2289
2290 /* Note that nextchr is a byte even in UTF */
2291 nextchr = UCHARAT(locinput);
2292 scan = prog;
2293 while (scan != NULL) {
8ba1375e 2294
2a782b5b 2295 DEBUG_r( {
d6a28714
JH
2296 SV *prop = sv_newmortal();
2297 int docolor = *PL_colors[0];
2298 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2299 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2300 /* The part of the string before starttry has one color
2301 (pref0_len chars), between starttry and current
2302 position another one (pref_len - pref0_len chars),
2303 after the current position the third one.
2304 We assume that pref0_len <= pref_len, otherwise we
2305 decrease pref0_len. */
9041c2e3 2306 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2307 ? (5 + taill) - l : locinput - PL_bostr;
2308 int pref0_len;
d6a28714 2309
df1ffd02 2310 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2311 pref_len++;
2312 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2313 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2314 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2315 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2316 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2317 l--;
d6a28714
JH
2318 if (pref0_len < 0)
2319 pref0_len = 0;
2320 if (pref0_len > pref_len)
2321 pref0_len = pref_len;
2322 regprop(prop, scan);
2a782b5b
JH
2323 {
2324 char *s0 =
f14c76ed 2325 do_utf8 && OP(scan) != CANY ?
2a782b5b 2326 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2327 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2328 locinput - pref_len;
df1ffd02 2329 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2330 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2331 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2332 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2333 locinput - pref_len + pref0_len;
df1ffd02 2334 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2335 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2336 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2337 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2338 locinput;
df1ffd02 2339 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2340 PerlIO_printf(Perl_debug_log,
2341 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2342 (IV)(locinput - PL_bostr),
2343 PL_colors[4],
2344 len0, s0,
2345 PL_colors[5],
2346 PL_colors[2],
2347 len1, s1,
2348 PL_colors[3],
2349 (docolor ? "" : "> <"),
2350 PL_colors[0],
2351 len2, s2,
2352 PL_colors[1],
2353 15 - l - pref_len + 1,
2354 "",
2355 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2356 SvPVX(prop));
2357 }
2358 });
d6a28714
JH
2359
2360 next = scan + NEXT_OFF(scan);
2361 if (next == scan)
2362 next = NULL;
2363
2364 switch (OP(scan)) {
2365 case BOL:
12d33761
HS
2366 if (locinput == PL_bostr || (PL_multiline &&
2367 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2368 {
2369 /* regtill = regbol; */
b8c5462f
JH
2370 break;
2371 }
d6a28714
JH
2372 sayNO;
2373 case MBOL:
12d33761
HS
2374 if (locinput == PL_bostr ||
2375 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2376 {
b8c5462f
JH
2377 break;
2378 }
d6a28714
JH
2379 sayNO;
2380 case SBOL:
c2a73568 2381 if (locinput == PL_bostr)
b8c5462f 2382 break;
d6a28714
JH
2383 sayNO;
2384 case GPOS:
2385 if (locinput == PL_reg_ganch)
2386 break;
2387 sayNO;
2388 case EOL:
2389 if (PL_multiline)
2390 goto meol;
2391 else
2392 goto seol;
2393 case MEOL:
2394 meol:
2395 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2396 sayNO;
b8c5462f 2397 break;
d6a28714
JH
2398 case SEOL:
2399 seol:
2400 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2401 sayNO;
d6a28714 2402 if (PL_regeol - locinput > 1)
b8c5462f 2403 sayNO;
b8c5462f 2404 break;
d6a28714
JH
2405 case EOS:
2406 if (PL_regeol != locinput)
b8c5462f 2407 sayNO;
d6a28714 2408 break;
ffc61ed2 2409 case SANY:
d6a28714 2410 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2411 sayNO;
f33976b4
DB
2412 if (do_utf8) {
2413 locinput += PL_utf8skip[nextchr];
2414 if (locinput > PL_regeol)
2415 sayNO;
2416 nextchr = UCHARAT(locinput);
2417 }
2418 else
2419 nextchr = UCHARAT(++locinput);
2420 break;
2421 case CANY:
2422 if (!nextchr && locinput >= PL_regeol)
2423 sayNO;
b8c5462f 2424 nextchr = UCHARAT(++locinput);
a0d0e21e 2425 break;
ffc61ed2 2426 case REG_ANY:
1aa99e6b
IH
2427 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2428 sayNO;
2429 if (do_utf8) {
b8c5462f 2430 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2431 if (locinput > PL_regeol)
2432 sayNO;
a0ed51b3 2433 nextchr = UCHARAT(locinput);
a0ed51b3 2434 }
1aa99e6b
IH
2435 else
2436 nextchr = UCHARAT(++locinput);
a0ed51b3 2437 break;
d6a28714 2438 case EXACT:
cd439c50
IZ
2439 s = STRING(scan);
2440 ln = STR_LEN(scan);
eb160463 2441 if (do_utf8 != UTF) {
bc517b45 2442 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2443 char *l = locinput;
2444 char *e = s + ln;
bc517b45 2445 STRLEN ulen;
a72c7584 2446
5ff6fc6d
JH
2447 if (do_utf8) {
2448 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2449 while (s < e) {
2450 if (l >= PL_regeol)
5ff6fc6d
JH
2451 sayNO;
2452 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2453 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2454 ckWARN(WARN_UTF8) ?
2455 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2456 sayNO;
bc517b45 2457 l += ulen;
5ff6fc6d 2458 s ++;
1aa99e6b 2459 }
5ff6fc6d
JH
2460 }
2461 else {
2462 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2463 while (s < e) {
2464 if (l >= PL_regeol)
2465 sayNO;
5ff6fc6d 2466 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2467 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2468 ckWARN(WARN_UTF8) ?
2469 0 : UTF8_ALLOW_ANY))
1aa99e6b 2470 sayNO;
bc517b45 2471 s += ulen;
a72c7584 2472 l ++;
1aa99e6b 2473 }
5ff6fc6d 2474 }
1aa99e6b
IH
2475 locinput = l;
2476 nextchr = UCHARAT(locinput);
2477 break;
2478 }
bc517b45 2479 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2480 /* Inline the first character, for speed. */
2481 if (UCHARAT(s) != nextchr)
2482 sayNO;
2483 if (PL_regeol - locinput < ln)
2484 sayNO;
2485 if (ln > 1 && memNE(s, locinput, ln))
2486 sayNO;
2487 locinput += ln;
2488 nextchr = UCHARAT(locinput);
2489 break;
2490 case EXACTFL:
b8c5462f
JH
2491 PL_reg_flags |= RF_tainted;
2492 /* FALL THROUGH */
d6a28714 2493 case EXACTF:
cd439c50
IZ
2494 s = STRING(scan);
2495 ln = STR_LEN(scan);
d6a28714 2496
d07ddd77
JH
2497 if (do_utf8 || UTF) {
2498 /* Either target or the pattern are utf8. */
d6a28714 2499 char *l = locinput;
d07ddd77 2500 char *e = PL_regeol;
bc517b45 2501
eb160463 2502 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2503 l, &e, 0, do_utf8)) {
5486206c
JH
2504 /* One more case for the sharp s:
2505 * pack("U0U*", 0xDF) =~ /ss/i,
2506 * the 0xC3 0x9F are the UTF-8
2507 * byte sequence for the U+00DF. */
2508 if (!(do_utf8 &&
2509 toLOWER(s[0]) == 's' &&
2510 ln >= 2 &&
2511 toLOWER(s[1]) == 's' &&
2512 (U8)l[0] == 0xC3 &&
2513 e - l >= 2 &&
2514 (U8)l[1] == 0x9F))
2515 sayNO;
2516 }
d07ddd77
JH
2517 locinput = e;
2518 nextchr = UCHARAT(locinput);
2519 break;
a0ed51b3 2520 }
d6a28714 2521
bc517b45
JH
2522 /* Neither the target and the pattern are utf8. */
2523
d6a28714
JH
2524 /* Inline the first character, for speed. */
2525 if (UCHARAT(s) != nextchr &&
2526 UCHARAT(s) != ((OP(scan) == EXACTF)
2527 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2528 sayNO;
d6a28714 2529 if (PL_regeol - locinput < ln)
b8c5462f 2530 sayNO;
d6a28714
JH
2531 if (ln > 1 && (OP(scan) == EXACTF
2532 ? ibcmp(s, locinput, ln)
2533 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2534 sayNO;
d6a28714
JH
2535 locinput += ln;
2536 nextchr = UCHARAT(locinput);
a0d0e21e 2537 break;
d6a28714 2538 case ANYOF:
ffc61ed2 2539 if (do_utf8) {
9e55ce06
JH
2540 STRLEN inclasslen = PL_regeol - locinput;
2541
ba7b4546 2542 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2543 sayNO_ANYOF;
ffc61ed2
JH
2544 if (locinput >= PL_regeol)
2545 sayNO;
0f0076b4 2546 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2547 nextchr = UCHARAT(locinput);
e0f9d4a8 2548 break;
ffc61ed2
JH
2549 }
2550 else {
2551 if (nextchr < 0)
2552 nextchr = UCHARAT(locinput);
7d3e948e 2553 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2554 sayNO_ANYOF;
ffc61ed2
JH
2555 if (!nextchr && locinput >= PL_regeol)
2556 sayNO;
2557 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2558 break;
2559 }
2560 no_anyof:
2561 /* If we might have the case of the German sharp s
2562 * in a casefolding Unicode character class. */
2563
ebc501f0
JH
2564 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2565 locinput += SHARP_S_SKIP;
e0f9d4a8 2566 nextchr = UCHARAT(locinput);
ffc61ed2 2567 }
e0f9d4a8
JH
2568 else
2569 sayNO;
b8c5462f 2570 break;
d6a28714 2571 case ALNUML:
b8c5462f
JH
2572 PL_reg_flags |= RF_tainted;
2573 /* FALL THROUGH */
d6a28714 2574 case ALNUM:
b8c5462f 2575 if (!nextchr)
4633a7c4 2576 sayNO;
ffc61ed2 2577 if (do_utf8) {
ad24be35 2578 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2579 if (!(OP(scan) == ALNUM
3568d838 2580 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2581 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2582 {
2583 sayNO;
a0ed51b3 2584 }
b8c5462f 2585 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2586 nextchr = UCHARAT(locinput);
2587 break;
2588 }
ffc61ed2 2589 if (!(OP(scan) == ALNUM
d6a28714 2590 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2591 sayNO;
b8c5462f 2592 nextchr = UCHARAT(++locinput);
a0d0e21e 2593 break;
d6a28714 2594 case NALNUML:
b8c5462f
JH
2595 PL_reg_flags |= RF_tainted;
2596 /* FALL THROUGH */
d6a28714
JH
2597 case NALNUM:
2598 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2599 sayNO;
ffc61ed2 2600 if (do_utf8) {
8269fa76 2601 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2602 if (OP(scan) == NALNUM
3568d838 2603 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2604 : isALNUM_LC_utf8((U8*)locinput))
2605 {
b8c5462f 2606 sayNO;
d6a28714 2607 }
b8c5462f
JH
2608 locinput += PL_utf8skip[nextchr];
2609 nextchr = UCHARAT(locinput);
2610 break;
2611 }
ffc61ed2 2612 if (OP(scan) == NALNUM
d6a28714 2613 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2614 sayNO;
76e3520e 2615 nextchr = UCHARAT(++locinput);
a0d0e21e 2616 break;
d6a28714
JH
2617 case BOUNDL:
2618 case NBOUNDL:
3280af22 2619 PL_reg_flags |= RF_tainted;
bbce6d69 2620 /* FALL THROUGH */
d6a28714
JH
2621 case BOUND:
2622 case NBOUND:
2623 /* was last char in word? */
ffc61ed2 2624 if (do_utf8) {
12d33761
HS
2625 if (locinput == PL_bostr)
2626 ln = '\n';
ffc61ed2 2627 else {
b4f7163a 2628 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2629
b4f7163a 2630 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2631 }
2632 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2633 ln = isALNUM_uni(ln);
8269fa76 2634 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2635 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2636 }
2637 else {
9041c2e3 2638 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2639 n = isALNUM_LC_utf8((U8*)locinput);
2640 }
a0ed51b3 2641 }
d6a28714 2642 else {
12d33761
HS
2643 ln = (locinput != PL_bostr) ?
2644 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2645 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2646 ln = isALNUM(ln);
2647 n = isALNUM(nextchr);
2648 }
2649 else {
2650 ln = isALNUM_LC(ln);
2651 n = isALNUM_LC(nextchr);
2652 }
d6a28714 2653 }
ffc61ed2
JH
2654 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2655 OP(scan) == BOUNDL))
2656 sayNO;
a0ed51b3 2657 break;
d6a28714 2658 case SPACEL:
3280af22 2659 PL_reg_flags |= RF_tainted;
bbce6d69 2660 /* FALL THROUGH */
d6a28714 2661 case SPACE:
9442cb0e 2662 if (!nextchr)
4633a7c4 2663 sayNO;
1aa99e6b 2664 if (do_utf8) {
fd400ab9 2665 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2666 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2667 if (!(OP(scan) == SPACE
3568d838 2668 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2669 : isSPACE_LC_utf8((U8*)locinput)))
2670 {
2671 sayNO;
2672 }
2673 locinput += PL_utf8skip[nextchr];
2674 nextchr = UCHARAT(locinput);
2675 break;
d6a28714 2676 }
ffc61ed2
JH
2677 if (!(OP(scan) == SPACE
2678 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2679 sayNO;
2680 nextchr = UCHARAT(++locinput);
2681 }
2682 else {
2683 if (!(OP(scan) == SPACE
2684 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2685 sayNO;
2686 nextchr = UCHARAT(++locinput);
a0ed51b3 2687 }
a0ed51b3 2688 break;
d6a28714 2689 case NSPACEL:
3280af22 2690 PL_reg_flags |= RF_tainted;
bbce6d69 2691 /* FALL THROUGH */
d6a28714 2692 case NSPACE:
9442cb0e 2693 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2694 sayNO;
1aa99e6b 2695 if (do_utf8) {
8269fa76 2696 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2697 if (OP(scan) == NSPACE
3568d838 2698 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2699 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2700 {
2701 sayNO;
2702 }
2703 locinput += PL_utf8skip[nextchr];
2704 nextchr = UCHARAT(locinput);
2705 break;
a0ed51b3 2706 }
ffc61ed2 2707 if (OP(scan) == NSPACE
d6a28714 2708 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2709 sayNO;
76e3520e 2710 nextchr = UCHARAT(++locinput);
a0d0e21e 2711 break;
d6a28714 2712 case DIGITL:
a0ed51b3
LW
2713 PL_reg_flags |= RF_tainted;
2714 /* FALL THROUGH */
d6a28714 2715 case DIGIT:
9442cb0e 2716 if (!nextchr)
a0ed51b3 2717 sayNO;
1aa99e6b 2718 if (do_utf8) {
8269fa76 2719 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2720 if (!(OP(scan) == DIGIT
3568d838 2721 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2722 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2723 {
a0ed51b3 2724 sayNO;
dfe13c55 2725 }
6f06b55f 2726 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2727 nextchr = UCHARAT(locinput);
2728 break;
2729 }
ffc61ed2 2730 if (!(OP(scan) == DIGIT
9442cb0e 2731 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2732 sayNO;
2733 nextchr = UCHARAT(++locinput);
2734 break;
d6a28714 2735 case NDIGITL:
b8c5462f
JH
2736 PL_reg_flags |= RF_tainted;
2737 /* FALL THROUGH */
d6a28714 2738 case NDIGIT:
9442cb0e 2739 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2740 sayNO;
1aa99e6b 2741 if (do_utf8) {
8269fa76 2742 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2743 if (OP(scan) == NDIGIT
3568d838 2744 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2745 : isDIGIT_LC_utf8((U8*)locinput))
2746 {
a0ed51b3 2747 sayNO;
9442cb0e 2748 }
6f06b55f 2749 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2750 nextchr = UCHARAT(locinput);
2751 break;
2752 }
ffc61ed2 2753 if (OP(scan) == NDIGIT
9442cb0e 2754 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2755 sayNO;
2756 nextchr = UCHARAT(++locinput);
2757 break;
2758 case CLUMP:
b7c83a7e 2759 if (locinput >= PL_regeol)
a0ed51b3 2760 sayNO;
b7c83a7e
JH
2761 if (do_utf8) {
2762 LOAD_UTF8_CHARCLASS(mark,"~");
2763 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2764 sayNO;
2765 locinput += PL_utf8skip[nextchr];
2766 while (locinput < PL_regeol &&
2767 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2768 locinput += UTF8SKIP(locinput);
2769 if (locinput > PL_regeol)
2770 sayNO;
eb08e2da
JH
2771 }
2772 else
2773 locinput++;
a0ed51b3
LW
2774 nextchr = UCHARAT(locinput);
2775 break;
c8756f30 2776 case REFFL:
3280af22 2777 PL_reg_flags |= RF_tainted;
c8756f30 2778 /* FALL THROUGH */
c277df42 2779 case REF:
c8756f30 2780 case REFF:
c277df42 2781 n = ARG(scan); /* which paren pair */
cf93c79d 2782 ln = PL_regstartp[n];
2c2d71f5 2783 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2784 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2785 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2786 if (ln == PL_regendp[n])
a0d0e21e 2787 break;
a0ed51b3 2788
cf93c79d 2789 s = PL_bostr + ln;
1aa99e6b 2790 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2791 char *l = locinput;
cf93c79d 2792 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2793 /*
2794 * Note that we can't do the "other character" lookup trick as
2795 * in the 8-bit case (no pun intended) because in Unicode we
2796 * have to map both upper and title case to lower case.
2797 */
2798 if (OP(scan) == REFF) {
a2a2844f 2799 STRLEN ulen1, ulen2;
e7ae6809
JH
2800 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2801 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2802 while (s < e) {
2803 if (l >= PL_regeol)
2804 sayNO;
a2a2844f
JH
2805 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2806 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2807 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2808 sayNO;
a2a2844f
JH
2809 s += ulen1;
2810 l += ulen2;
a0ed51b3
LW
2811 }
2812 }
2813 locinput = l;
2814 nextchr = UCHARAT(locinput);
2815 break;
2816 }
2817
a0d0e21e 2818 /* Inline the first character, for speed. */
76e3520e 2819 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2820 (OP(scan) == REF ||
2821 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2822 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2823 sayNO;
cf93c79d 2824 ln = PL_regendp[n] - ln;
3280af22 2825 if (locinput + ln > PL_regeol)
4633a7c4 2826 sayNO;
c8756f30
AK
2827 if (ln > 1 && (OP(scan) == REF
2828 ? memNE(s, locinput, ln)
2829 : (OP(scan) == REFF
2830 ? ibcmp(s, locinput, ln)
2831 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2832 sayNO;
a0d0e21e 2833 locinput += ln;
76e3520e 2834 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2835 break;
2836
2837 case NOTHING:
c277df42 2838 case TAIL:
a0d0e21e
LW
2839 break;
2840 case BACK:
2841 break;
c277df42
IZ
2842 case EVAL:
2843 {
2844 dSP;
533c011a 2845 OP_4tree *oop = PL_op;
3280af22 2846 COP *ocurcop = PL_curcop;
f3548bdc 2847 PAD *old_comppad;
c277df42 2848 SV *ret;
080c2dec 2849 struct regexp *oreg = PL_reg_re;
9041c2e3 2850
c277df42 2851 n = ARG(scan);
533c011a 2852 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2853 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2854 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2855 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2856
8e5e9ebe
RGS
2857 {
2858 SV **before = SP;
2859 CALLRUNOPS(aTHX); /* Scalar context. */
2860 SPAGAIN;
2861 if (SP == before)
075aa684 2862 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2863 else {
2864 ret = POPs;
2865 PUTBACK;
2866 }
2867 }
2868
0f5d15d6 2869 PL_op = oop;
f3548bdc 2870 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2871 PL_curcop = ocurcop;
c277df42 2872 if (logical) {
0f5d15d6
IZ
2873 if (logical == 2) { /* Postponed subexpression. */
2874 regexp *re;
22c35a8c 2875 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2876 re_cc_state state;
0f5d15d6 2877 CHECKPOINT cp, lastcp;
cb50f42d 2878 int toggleutf;
faf82a0b 2879 register SV *sv;
0f5d15d6 2880
faf82a0b
AE
2881 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2882 mg = mg_find(sv, PERL_MAGIC_qr);
2883 else if (SvSMAGICAL(ret)) {
2884 if (SvGMAGICAL(ret))
2885 sv_unmagic(ret, PERL_MAGIC_qr);
2886 else
2887 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 2888 }
faf82a0b 2889
0f5d15d6
IZ
2890 if (mg) {
2891 re = (regexp *)mg->mg_obj;
df0003d4 2892 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2893 }
2894 else {
2895 STRLEN len;
2896 char *t = SvPV(ret, len);
2897 PMOP pm;
2898 char *oprecomp = PL_regprecomp;
2899 I32 osize = PL_regsize;
2900 I32 onpar = PL_regnpar;
2901
5fcd1c1b 2902 Zero(&pm, 1, PMOP);
cb50f42d 2903 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2904 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2905 if (!(SvFLAGS(ret)
faf82a0b
AE
2906 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2907 | SVs_GMG)))
14befaf4
DM
2908 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2909 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2910 PL_regprecomp = oprecomp;
2911 PL_regsize = osize;
2912 PL_regnpar = onpar;
2913 }
2914 DEBUG_r(
9041c2e3 2915 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2916 "Entering embedded `%s%.60s%s%s'\n",
2917 PL_colors[0],
2918 re->precomp,
2919 PL_colors[1],
2920 (strlen(re->precomp) > 60 ? "..." : ""))
2921 );
2922 state.node = next;
2923 state.prev = PL_reg_call_cc;
2924 state.cc = PL_regcc;
2925 state.re = PL_reg_re;
2926
2ab05381 2927 PL_regcc = 0;
9041c2e3 2928
0f5d15d6 2929 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2930 REGCP_SET(lastcp);
0f5d15d6
IZ
2931 cache_re(re);
2932 state.ss = PL_savestack_ix;
2933 *PL_reglastparen = 0;
a01268b5 2934 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2935 PL_reg_call_cc = &state;
2936 PL_reginput = locinput;
cb50f42d
YST
2937 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2938 ((re->reganch & ROPT_UTF8) != 0);
2939 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2940
2941 /* XXXX This is too dramatic a measure... */
2942 PL_reg_maxiter = 0;
2943
0f5d15d6 2944 if (regmatch(re->program + 1)) {
2c914db6
IZ
2945 /* Even though we succeeded, we need to restore
2946 global variables, since we may be wrapped inside
2947 SUSPEND, thus the match may be not finished yet. */
2948
2949 /* XXXX Do this only if SUSPENDed? */
2950 PL_reg_call_cc = state.prev;
2951 PL_regcc = state.cc;
2952 PL_reg_re = state.re;
2953 cache_re(PL_reg_re);
cb50f42d 2954 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2955
2956 /* XXXX This is too dramatic a measure... */
2957 PL_reg_maxiter = 0;
2958
2959 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2960 ReREFCNT_dec(re);
2961 regcpblow(cp);
2962 sayYES;
2963 }
0f5d15d6 2964 ReREFCNT_dec(re);
02db2b7b 2965 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2966 regcppop();
2967 PL_reg_call_cc = state.prev;
2968 PL_regcc = state.cc;
2969 PL_reg_re = state.re;
d3790889 2970 cache_re(PL_reg_re);
cb50f42d 2971 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2972
2973 /* XXXX This is too dramatic a measure... */
2974 PL_reg_maxiter = 0;
2975
8e514ae6 2976 logical = 0;
0f5d15d6
IZ
2977 sayNO;
2978 }
c277df42 2979 sw = SvTRUE(ret);
0f5d15d6 2980 logical = 0;
a0ed51b3 2981 }
080c2dec 2982 else {
3280af22 2983 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
2984 cache_re(oreg);
2985 }
c277df42
IZ
2986 break;
2987 }
a0d0e21e 2988 case OPEN:
c277df42 2989 n = ARG(scan); /* which paren pair */
3280af22
NIS
2990 PL_reg_start_tmp[n] = locinput;
2991 if (n > PL_regsize)
2992 PL_regsize = n;
a0d0e21e
LW
2993 break;
2994 case CLOSE:
c277df42 2995 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2996 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2997 PL_regendp[n] = locinput - PL_bostr;
eb160463 2998 if (n > (I32)*PL_reglastparen)
3280af22 2999 *PL_reglastparen = n;
a01268b5 3000 *PL_reglastcloseparen = n;
a0d0e21e 3001 break;
c277df42
IZ
3002 case GROUPP:
3003 n = ARG(scan); /* which paren pair */
eb160463 3004 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3005 break;
3006 case IFTHEN:
2c2d71f5 3007 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3008 if (sw)
3009 next = NEXTOPER(NEXTOPER(scan));
3010 else {
3011 next = scan + ARG(scan);
3012 if (OP(next) == IFTHEN) /* Fake one. */
3013 next = NEXTOPER(NEXTOPER(next));
3014 }
3015 break;
3016 case LOGICAL:
0f5d15d6 3017 logical = scan->flags;
c277df42 3018 break;
2ab05381
IZ
3019/*******************************************************************
3020 PL_regcc contains infoblock about the innermost (...)* loop, and
3021 a pointer to the next outer infoblock.
3022
3023 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3024
3025 1) After matching X, regnode for CURLYX is processed;
3026
9041c2e3 3027 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3028 regmatch() recursively with the starting point at WHILEM node;
3029
3030 3) Each hit of WHILEM node tries to match A and Z (in the order
3031 depending on the current iteration, min/max of {min,max} and
3032 greediness). The information about where are nodes for "A"
3033 and "Z" is read from the infoblock, as is info on how many times "A"
3034 was already matched, and greediness.
3035
3036 4) After A matches, the same WHILEM node is hit again.
3037
3038 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3039 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3040 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3041 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3042 of the external loop.
3043
3044 Currently present infoblocks form a tree with a stem formed by PL_curcc
3045 and whatever it mentions via ->next, and additional attached trees
3046 corresponding to temporarily unset infoblocks as in "5" above.
3047
9041c2e3 3048 In the following picture infoblocks for outer loop of
2ab05381
IZ
3049 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3050 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3051 infoblocks are drawn below the "reset" infoblock.
3052
3053 In fact in the picture below we do not show failed matches for Z and T
3054 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3055 more obvious *why* one needs to *temporary* unset infoblocks.]
3056
3057 Matched REx position InfoBlocks Comment
3058 (Y(A)*?Z)*?T x
3059 Y(A)*?Z)*?T x <- O
3060 Y (A)*?Z)*?T x <- O
3061 Y A)*?Z)*?T x <- O <- I
3062 YA )*?Z)*?T x <- O <- I
3063 YA A)*?Z)*?T x <- O <- I
3064 YAA )*?Z)*?T x <- O <- I
3065 YAA Z)*?T x <- O # Temporary unset I
3066 I
3067
3068 YAAZ Y(A)*?Z)*?T x <- O
3069 I
3070
3071 YAAZY (A)*?Z)*?T x <- O
3072 I
3073
3074 YAAZY A)*?Z)*?T x <- O <- I
3075 I
3076
3077 YAAZYA )*?Z)*?T x <- O <- I
3078 I
3079
3080 YAAZYA Z)*?T x <- O # Temporary unset I
3081 I,I
3082
3083 YAAZYAZ )*?T x <- O
3084 I,I
3085
3086 YAAZYAZ T x # Temporary unset O
3087 O
3088 I,I
3089
3090 YAAZYAZT x
3091 O
3092 I,I
3093 *******************************************************************/
a0d0e21e
LW
3094 case CURLYX: {
3095 CURCUR cc;
3280af22 3096 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3097 /* No need to save/restore up to this paren */
3098 I32 parenfloor = scan->flags;
c277df42
IZ
3099
3100 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3101 next += ARG(next);
3280af22
NIS
3102 cc.oldcc = PL_regcc;
3103 PL_regcc = &cc;
cb434fcc
IZ
3104 /* XXXX Probably it is better to teach regpush to support
3105 parenfloor > PL_regsize... */
eb160463 3106 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3107 parenfloor = *PL_reglastparen; /* Pessimization... */
3108 cc.parenfloor = parenfloor;
a0d0e21e
LW
3109 cc.cur = -1;
3110 cc.min = ARG1(scan);
3111 cc.max = ARG2(scan);
c277df42 3112 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3113 cc.next = next;
3114 cc.minmod = minmod;
3115 cc.lastloc = 0;
3280af22 3116 PL_reginput = locinput;
a0d0e21e
LW
3117 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3118 regcpblow(cp);
3280af22 3119 PL_regcc = cc.oldcc;
4633a7c4 3120 saySAME(n);
a0d0e21e
LW
3121 }
3122 /* NOT REACHED */
3123 case WHILEM: {
3124 /*
3125 * This is really hard to understand, because after we match
3126 * what we're trying to match, we must make sure the rest of
2c2d71f5 3127 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3128 * to go back UP the parse tree by recursing ever deeper. And
3129 * if it fails, we have to reset our parent's current state
3130 * that we can try again after backing off.
3131 */
3132
c277df42 3133 CHECKPOINT cp, lastcp;
3280af22 3134 CURCUR* cc = PL_regcc;
c277df42
IZ
3135 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3136
4633a7c4 3137 n = cc->cur + 1; /* how many we know we matched */
3280af22 3138 PL_reginput = locinput;
a0d0e21e 3139
c277df42 3140 DEBUG_r(
9041c2e3 3141 PerlIO_printf(Perl_debug_log,
91f3b821 3142 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3143 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3144 (long)n, (long)cc->min,
2797576d 3145 (long)cc->max, PTR2UV(cc))
c277df42 3146 );
4633a7c4 3147
a0d0e21e
LW
3148 /* If degenerate scan matches "", assume scan done. */
3149
579cf2c3 3150 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3151 PL_regcc = cc->oldcc;
2ab05381
IZ
3152 if (PL_regcc)
3153 ln = PL_regcc->cur;
c277df42 3154 DEBUG_r(
c3464db5
DD
3155 PerlIO_printf(Perl_debug_log,
3156 "%*s empty match detected, try continuation...\n",
3280af22 3157 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3158 );
a0d0e21e 3159 if (regmatch(cc->next))
4633a7c4 3160 sayYES;
2ab05381
IZ
3161 if (PL_regcc)
3162 PL_regcc->cur = ln;
3280af22 3163 PL_regcc = cc;
4633a7c4 3164 sayNO;
a0d0e21e
LW
3165 }
3166
3167 /* First just match a string of min scans. */
3168
3169 if (n < cc->min) {
3170 cc->cur = n;
3171 cc->lastloc = locinput;
4633a7c4
LW
3172 if (regmatch(cc->scan))
3173 sayYES;
3174 cc->cur = n - 1;
c277df42 3175 cc->lastloc = lastloc;
4633a7c4 3176 sayNO;
a0d0e21e
LW
3177 }
3178
2c2d71f5
JH
3179 if (scan->flags) {
3180 /* Check whether we already were at this position.
3181 Postpone detection until we know the match is not
3182 *that* much linear. */
3183 if (!PL_reg_maxiter) {
3184 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3185 PL_reg_leftiter = PL_reg_maxiter;
3186 }
3187 if (PL_reg_leftiter-- == 0) {
3188 I32 size = (PL_reg_maxiter + 7)/8;
3189 if (PL_reg_poscache) {
eb160463 3190 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3191 Renew(PL_reg_poscache, size, char);
3192 PL_reg_poscache_size = size;
3193 }
3194 Zero(PL_reg_poscache, size, char);
3195 }
3196 else {
3197 PL_reg_poscache_size = size;
3198 Newz(29, PL_reg_poscache, size, char);
3199 }
3200 DEBUG_r(
3201 PerlIO_printf(Perl_debug_log,
3202 "%sDetected a super-linear match, switching on caching%s...\n",
3203 PL_colors[4], PL_colors[5])
3204 );
3205 }
3206 if (PL_reg_leftiter < 0) {
3207 I32 o = locinput - PL_bostr, b;
3208
3209 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3210 b = o % 8;
3211 o /= 8;
3212 if (PL_reg_poscache[o] & (1<<b)) {
3213 DEBUG_r(
3214 PerlIO_printf(Perl_debug_log,
3215 "%*s already tried at this position...\n",
3216 REPORT_CODE_OFF+PL_regindent*2, "")
3217 );
c2b0868c
HS
3218 if (PL_reg_flags & RF_false)
3219 sayYES;
3220 else
3221 sayNO_SILENT;
2c2d71f5
JH
3222 }
3223 PL_reg_poscache[o] |= (1<<b);
3224 }
3225 }
3226
a0d0e21e
LW
3227 /* Prefer next over scan for minimal matching. */
3228
3229 if (cc->minmod) {
3280af22 3230 PL_regcc = cc->oldcc;
2ab05381
IZ
3231 if (PL_regcc)
3232 ln = PL_regcc->cur;
5f05dabc 3233 cp = regcppush(cc->parenfloor);
02db2b7b 3234 REGCP_SET(lastcp);
5f05dabc 3235 if (regmatch(cc->next)) {
c277df42 3236 regcpblow(cp);
4633a7c4 3237 sayYES; /* All done. */
5f05dabc 3238 }
02db2b7b 3239 REGCP_UNWIND(lastcp);
5f05dabc 3240 regcppop();
2ab05381
IZ
3241 if (PL_regcc)
3242 PL_regcc->cur = ln;
3280af22 3243 PL_regcc = cc;
a0d0e21e 3244
c277df42 3245 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3246 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3247 && !(PL_reg_flags & RF_warned)) {
3248 PL_reg_flags |= RF_warned;
9014280d 3249 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3250 "Complex regular subexpression recursion",
3251 REG_INFTY - 1);
c277df42 3252 }
4633a7c4 3253 sayNO;
c277df42 3254 }
a687059c 3255
c277df42 3256 DEBUG_r(
c3464db5
DD
3257 PerlIO_printf(Perl_debug_log,
3258 "%*s trying longer...\n",
3280af22 3259 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3260 );
a0d0e21e 3261 /* Try scanning more and see if it helps. */
3280af22 3262 PL_reginput = locinput;
a0d0e21e
LW
3263 cc->cur = n;
3264 cc->lastloc = locinput;
5f05dabc 3265 cp = regcppush(cc->parenfloor);
02db2b7b 3266 REGCP_SET(lastcp);
5f05dabc 3267 if (regmatch(cc->scan)) {
c277df42 3268 regcpblow(cp);
4633a7c4 3269 sayYES;
5f05dabc 3270 }
02db2b7b 3271 REGCP_UNWIND(lastcp);
5f05dabc 3272 regcppop();
4633a7c4 3273 cc->cur = n - 1;
c277df42 3274 cc->lastloc = lastloc;
4633a7c4 3275 sayNO;
a0d0e21e
LW
3276 }
3277
3278 /* Prefer scan over next for maximal matching. */
3279
3280 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3281 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3282 cc->cur = n;
3283 cc->lastloc = locinput;
02db2b7b 3284 REGCP_SET(lastcp);
5f05dabc 3285 if (regmatch(cc->scan)) {
c277df42 3286 regcpblow(cp);
4633a7c4 3287 sayYES;
5f05dabc 3288 }
02db2b7b 3289 REGCP_UNWIND(lastcp);
a0d0e21e 3290 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3291 PL_reginput = locinput;
c277df42 3292 DEBUG_r(
c3464db5
DD
3293 PerlIO_printf(Perl_debug_log,
3294 "%*s failed, try continuation...\n",
3280af22 3295 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3296 );
3297 }
9041c2e3 3298 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3299 && !(PL_reg_flags & RF_warned)) {
3280af22 3300 PL_reg_flags |= RF_warned;
9014280d 3301 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3302 "Complex regular subexpression recursion",
3303 REG_INFTY - 1);
a0d0e21e
LW
3304 }
3305
3306 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3307 PL_regcc = cc->oldcc;
2ab05381
IZ
3308 if (PL_regcc)
3309 ln = PL_regcc->cur;
a0d0e21e 3310 if (regmatch(cc->next))
4633a7c4 3311 sayYES;
2ab05381
IZ
3312 if (PL_regcc)
3313 PL_regcc->cur = ln;
3280af22 3314 PL_regcc = cc;
4633a7c4 3315 cc->cur = n - 1;
c277df42 3316 cc->lastloc = lastloc;
4633a7c4 3317 sayNO;
a0d0e21e
LW
3318 }
3319 /* NOT REACHED */
9041c2e3 3320 case BRANCHJ:
c277df42
IZ
3321 next = scan + ARG(scan);
3322 if (next == scan)
3323 next = NULL;
3324 inner = NEXTOPER(NEXTOPER(scan));
3325 goto do_branch;
9041c2e3 3326 case BRANCH:
c277df42
IZ
3327 inner = NEXTOPER(scan);
3328 do_branch:
3329 {
c277df42
IZ
3330 c1 = OP(scan);
3331 if (OP(next) != c1) /* No choice. */
3332 next = inner; /* Avoid recursion. */
a0d0e21e 3333 else {
02db2b7b
IZ
3334 I32 lastparen = *PL_reglastparen;
3335 I32 unwind1;
3336 re_unwind_branch_t *uw;
3337
3338 /* Put unwinding data on stack */
3339 unwind1 = SSNEWt(1,re_unwind_branch_t);
3340 uw = SSPTRt(unwind1,re_unwind_branch_t);
3341 uw->prev = unwind;
3342 unwind = unwind1;
3343 uw->type = ((c1 == BRANCH)
3344 ? RE_UNWIND_BRANCH
3345 : RE_UNWIND_BRANCHJ);
3346 uw->lastparen = lastparen;
3347 uw->next = next;
3348 uw->locinput = locinput;
3349 uw->nextchr = nextchr;
3350#ifdef DEBUGGING
3351 uw->regindent = ++PL_regindent;
3352#endif
c277df42 3353
02db2b7b
IZ
3354 REGCP_SET(uw->lastcp);
3355
3356 /* Now go into the first branch */
3357 next = inner;
a687059c 3358 }
a0d0e21e
LW
3359 }
3360 break;
3361 case MINMOD:
3362 minmod = 1;
3363 break;
c277df42
IZ
3364 case CURLYM:
3365 {
00db4c45 3366 I32 l = 0;
c277df42 3367 CHECKPOINT lastcp;
9041c2e3 3368
c277df42
IZ
3369 /* We suppose that the next guy does not need
3370 backtracking: in particular, it is of constant length,
3371 and has no parenths to influence future backrefs. */
3372 ln = ARG1(scan); /* min to match */
3373 n = ARG2(scan); /* max to match */
c277df42
IZ
3374 paren = scan->flags;
3375 if (paren) {
3280af22
NIS
3376 if (paren > PL_regsize)
3377 PL_regsize = paren;
eb160463 3378 if (paren > (I32)*PL_reglastparen)
3280af22 3379 *PL_reglastparen = paren;
c277df42 3380 }
dc45a647 3381 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3382 if (paren)
3383 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3384 PL_reginput = locinput;
c277df42
IZ
3385 if (minmod) {
3386 minmod = 0;
3387 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3388 sayNO;
f31a99c8
HS
3389 /* if we matched something zero-length we don't need to
3390 backtrack - capturing parens are already defined, so
3391 the caveat in the maximal case doesn't apply
3392
3393 XXXX if ln == 0, we can redo this check first time
3394 through the following loop
3395 */
3396 if (ln && l == 0)
3397 n = ln; /* don't backtrack */
3280af22 3398 locinput = PL_reginput;
cca55fe3 3399 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3400 regnode *text_node = next;
3401
cca55fe3 3402 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3403
cca55fe3 3404 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3405 else {
cca55fe3 3406 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3407 c1 = c2 = -1000;
3408 goto assume_ok_MM;
cca55fe3
JP
3409 }
3410 else { c1 = (U8)*STRING(text_node); }
af5decee 3411 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3412 c2 = PL_fold[c1];
af5decee 3413 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3414 c2 = PL_fold_locale[c1];
3415 else
3416 c2 = c1;
3417 }
a0ed51b3
LW
3418 }
3419 else
c277df42 3420 c1 = c2 = -1000;
cca55fe3 3421 assume_ok_MM:
02db2b7b 3422 REGCP_SET(lastcp);
5f4b28b2 3423 /* This may be improved if l == 0. */
c277df42
IZ
3424 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3425 /* If it could work, try it. */
3426 if (c1 == -1000 ||
3280af22
NIS
3427 UCHARAT(PL_reginput) == c1 ||
3428 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3429 {
3430 if (paren) {
f31a99c8 3431 if (ln) {
cf93c79d
IZ
3432 PL_regstartp[paren] =
3433 HOPc(PL_reginput, -l) - PL_bostr;
3434 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3435 }
3436 else
cf93c79d 3437 PL_regendp[paren] = -1;
c277df42
IZ
3438 }
3439 if (regmatch(next))
3440 sayYES;
02db2b7b 3441 REGCP_UNWIND(lastcp);
c277df42
IZ
3442 }
3443 /* Couldn't or didn't -- move forward. */
3280af22 3444 PL_reginput = locinput;
c277df42
IZ
3445 if (regrepeat_hard(scan, 1, &l)) {
3446 ln++;
3280af22 3447 locinput = PL_reginput;
c277df42
IZ
3448 }
3449 else
3450 sayNO;
3451 }
a0ed51b3
LW
3452 }
3453 else {
c277df42 3454 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3455 /* if we matched something zero-length we don't need to
3456 backtrack, unless the minimum count is zero and we
3457 are capturing the result - in that case the capture
3458 being defined or not may affect later execution
3459 */
3460 if (n != 0 && l == 0 && !(paren && ln == 0))
3461 ln = n; /* don't backtrack */
3280af22 3462 locinput = PL_reginput;
c277df42 3463 DEBUG_r(
5c0ca799 3464 PerlIO_printf(Perl_debug_log,
faccc32b 3465 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3466 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3467 (IV) n, (IV)l)
c277df42
IZ
3468 );
3469 if (n >= ln) {
cca55fe3 3470 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3471 regnode *text_node = next;
3472
cca55fe3 3473 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3474
cca55fe3 3475 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3476 else {
cca55fe3 3477 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3478 c1 = c2 = -1000;
3479 goto assume_ok_REG;
cca55fe3
JP
3480 }
3481 else { c1 = (U8)*STRING(text_node); }
3482
af5decee 3483 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3484 c2 = PL_fold[c1];
af5decee 3485 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3486 c2 = PL_fold_locale[c1];
3487 else
3488 c2 = c1;
3489 }
a0ed51b3
LW
3490 }
3491 else
c277df42
IZ
3492 c1 = c2 = -1000;
3493 }
cca55fe3 3494 assume_ok_REG:
02db2b7b 3495 REGCP_SET(lastcp);
c277df42
IZ
3496 while (n >= ln) {
3497 /* If it could work, try it. */
3498 if (c1 == -1000 ||
3280af22
NIS
3499 UCHARAT(PL_reginput) == c1 ||
3500 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3501 {
3502 DEBUG_r(
c3464db5 3503 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3504 "%*s trying tail with n=%"IVdf"...\n",
3505 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3506 );
3507 if (paren) {
3508 if (n) {
cf93c79d
IZ
3509 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3510 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3511 }
a0ed51b3 3512 else
cf93c79d 3513 PL_regendp[paren] = -1;
c277df42 3514 }
a0ed51b3
LW
3515 if (regmatch(next))
3516 sayYES;
02db2b7b 3517 REGCP_UNWIND(lastcp);
a0ed51b3 3518 }
c277df42
IZ
3519 /* Couldn't or didn't -- back up. */
3520 n--;
dfe13c55 3521 locinput = HOPc(locinput, -l);
3280af22 3522 PL_reginput = locinput;
c277df42
IZ
3523 }
3524 }
3525 sayNO;
3526 break;
3527 }
3528 case CURLYN:
3529 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3530 if (paren > PL_regsize)
3531 PL_regsize = paren;
eb160463 3532 if (paren > (I32)*PL_reglastparen)
3280af22 3533 *PL_reglastparen = paren;
c277df42
IZ
3534 ln = ARG1(scan); /* min to match */
3535 n = ARG2(scan); /* max to match */
dc45a647 3536 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3537 goto repeat;
a0d0e21e 3538 case CURLY:
c277df42 3539 paren = 0;
a0d0e21e
LW
3540 ln = ARG1(scan); /* min to match */
3541 n = ARG2(scan); /* max to match */
dc45a647 3542 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3543 goto repeat;
3544 case STAR:
3545 ln = 0;
c277df42 3546 n = REG_INFTY;
a0d0e21e 3547 scan = NEXTOPER(scan);
c277df42 3548 paren = 0;
a0d0e21e
LW
3549 goto repeat;
3550 case PLUS:
c277df42
IZ
3551 ln = 1;
3552 n = REG_INFTY;
3553 scan = NEXTOPER(scan);
3554 paren = 0;
3555 repeat:
a0d0e21e
LW
3556 /*
3557 * Lookahead to avoid useless match attempts
3558 * when we know what character comes next.
3559 */
5f80c4cf
JP
3560
3561 /*
3562 * Used to only do .*x and .*?x, but now it allows
3563 * for )'s, ('s and (?{ ... })'s to be in the way
3564 * of the quantifier and the EXACT-like node. -- japhy
3565 */
3566
cca55fe3 3567 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3568 U8 *s;
3569 regnode *text_node = next;
3570
cca55fe3 3571 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3572
cca55fe3 3573 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3574 else {
cca55fe3 3575 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3576 c1 = c2 = -1000;
3577 goto assume_ok_easy;
cca55fe3
JP
3578 }
3579 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3580
3581 if (!UTF) {
3582 c2 = c1 = *s;
f65d3ee7 3583 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3584 c2 = PL_fold[c1];
f65d3ee7 3585 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3586 c2 = PL_fold_locale[c1];
1aa99e6b 3587 }
5f80c4cf 3588 else { /* UTF */
f65d3ee7 3589 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3590 STRLEN ulen1, ulen2;
e7ae6809
JH
3591 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3592 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3593
3594 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3595 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3596
872c91ae
JH
3597 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3598 ckWARN(WARN_UTF8) ?
3599 0 : UTF8_ALLOW_ANY);
3600 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3601 ckWARN(WARN_UTF8) ?
3602 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3603 }
3604 else {
872c91ae
JH
3605 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3606 ckWARN(WARN_UTF8) ?
3607 0 : UTF8_ALLOW_ANY);
5f80c4cf 3608 }
1aa99e6b
IH
3609 }
3610 }
bbce6d69 3611 }
a0d0e21e 3612 else
bbce6d69 3613 c1 = c2 = -1000;
cca55fe3 3614 assume_ok_easy:
3280af22 3615 PL_reginput = locinput;
a0d0e21e 3616 if (minmod) {
c277df42 3617 CHECKPOINT lastcp;
a0d0e21e
LW
3618 minmod = 0;
3619 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3620 sayNO;
a0ed51b3 3621 locinput = PL_reginput;
02db2b7b 3622 REGCP_SET(lastcp);
0fe9bf95 3623 if (c1 != -1000) {
1aa99e6b 3624 char *e; /* Should not check after this */
0fe9bf95 3625 char *old = locinput;
b2f2f093 3626 int count = 0;
0fe9bf95 3627
1aa99e6b 3628 if (n == REG_INFTY) {
0fe9bf95 3629 e = PL_regeol - 1;
1aa99e6b
IH
3630 if (do_utf8)
3631 while (UTF8_IS_CONTINUATION(*(U8*)e))
3632 e--;
3633 }
3634 else if (do_utf8) {
3635 int m = n - ln;
3636 for (e = locinput;
3637 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3638 e += UTF8SKIP(e);
3639 }
3640 else {
3641 e = locinput + n - ln;
3642 if (e >= PL_regeol)
3643 e = PL_regeol - 1;
3644 }
0fe9bf95
IZ
3645 while (1) {
3646 /* Find place 'next' could work */
1aa99e6b
IH
3647 if (!do_utf8) {
3648 if (c1 == c2) {
a8e8ab15
JH
3649 while (locinput <= e &&
3650 UCHARAT(locinput) != c1)
1aa99e6b
IH
3651 locinput++;
3652 } else {
9041c2e3 3653 while (locinput <= e
a8e8ab15
JH
3654 && UCHARAT(locinput) != c1
3655 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3656 locinput++;
3657 }
3658 count = locinput - old;
3659 }
3660 else {
3661 STRLEN len;
3662 if (c1 == c2) {
872c91ae
JH
3663 /* count initialised to
3664 * utf8_distance(old, locinput) */
b2f2f093 3665 while (locinput <= e &&
872c91ae
JH
3666 utf8n_to_uvchr((U8*)locinput,
3667 UTF8_MAXLEN, &len,
3668 ckWARN(WARN_UTF8) ?
eb160463 3669 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3670 locinput += len;
b2f2f093
JH
3671 count++;
3672 }
1aa99e6b 3673 } else {
872c91ae
JH
3674 /* count initialised to
3675 * utf8_distance(old, locinput) */
b2f2f093 3676 while (locinput <= e) {
872c91ae
JH
3677 UV c = utf8n_to_uvchr((U8*)locinput,
3678 UTF8_MAXLEN, &len,
3679 ckWARN(WARN_UTF8) ?
3680 0 : UTF8_ALLOW_ANY);
eb160463 3681 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3682 break;
b2f2f093
JH
3683 locinput += len;
3684 count++;
1aa99e6b
IH
3685 }
3686 }
0fe9bf95 3687 }
9041c2e3 3688 if (locinput > e)
0fe9bf95
IZ
3689 sayNO;
3690 /* PL_reginput == old now */
3691 if (locinput != old) {
3692 ln = 1; /* Did some */
1aa99e6b 3693 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3694 sayNO;
3695 }
3696 /* PL_reginput == locinput now */
29d1e993 3697 TRYPAREN(paren, ln, locinput);
0fe9bf95 3698 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3699 REGCP_UNWIND(lastcp);
0fe9bf95 3700 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3701 old = locinput;
3702 if (do_utf8)
3703 locinput += UTF8SKIP(locinput);
3704 else
3705 locinput++;
b2f2f093 3706 count = 1;
0fe9bf95
IZ
3707 }
3708 }
3709 else
c277df42 3710 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3711 UV c;
3712 if (c1 != -1000) {
3713 if (do_utf8)
872c91ae
JH
3714 c = utf8n_to_uvchr((U8*)PL_reginput,
3715 UTF8_MAXLEN, 0,
3716 ckWARN(WARN_UTF8) ?
3717 0 : UTF8_ALLOW_ANY);
1aa99e6b 3718 else
9041c2e3 3719 c = UCHARAT(PL_reginput);
2390ecbc 3720 /* If it could work, try it. */
eb160463 3721 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3722 {
ecc99935 3723 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3724 REGCP_UNWIND(lastcp);
3725 }
1aa99e6b 3726 }
a0d0e21e 3727 /* If it could work, try it. */
2390ecbc 3728 else if (c1 == -1000)
bbce6d69 3729 {
ecc99935 3730 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3731 REGCP_UNWIND(lastcp);
bbce6d69 3732 }
c277df42 3733 /* Couldn't or didn't -- move forward. */
a0ed51b3 3734 PL_reginput = locinput;
a0d0e21e
LW
3735 if (regrepeat(scan, 1)) {
3736 ln++;
a0ed51b3
LW
3737 locinput = PL_reginput;
3738 }
3739 else
4633a7c4 3740 sayNO;
a0d0e21e
LW
3741 }
3742 }
3743 else {
c277df42 3744 CHECKPOINT lastcp;
a0d0e21e 3745 n = regrepeat(scan, n);
a0ed51b3 3746 locinput = PL_reginput;
22c35a8c 3747 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3748 ((!PL_multiline && OP(next) != MEOL) ||
3749 OP(next) == SEOL || OP(next) == EOS))
3750 {
a0d0e21e 3751 ln = n; /* why back off? */
1aeab75a
GS
3752 /* ...because $ and \Z can match before *and* after
3753 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3754 We should back off by one in this case. */
3755 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3756 ln--;
3757 }
02db2b7b 3758 REGCP_SET(lastcp);
c277df42 3759 if (paren) {
8fa7f367 3760 UV c = 0;
c277df42 3761 while (n >= ln) {
1aa99e6b
IH
3762 if (c1 != -1000) {
3763 if (do_utf8)
872c91ae
JH
3764 c = utf8n_to_uvchr((U8*)PL_reginput,
3765 UTF8_MAXLEN, 0,
3766 ckWARN(WARN_UTF8) ?
3767 0 : UTF8_ALLOW_ANY);
1aa99e6b 3768 else
9041c2e3 3769 c = UCHARAT(PL_reginput);
1aa99e6b 3770 }
c277df42 3771 /* If it could work, try it. */
eb160463 3772 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3773 {
29d1e993 3774 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3775 REGCP_UNWIND(lastcp);
c277df42
IZ
3776 }
3777 /* Couldn't or didn't -- back up. */
3778 n--;
dfe13c55 3779 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3780 }
a0ed51b3
LW
3781 }
3782 else {
8fa7f367 3783 UV c = 0;
c277df42 3784 while (n >= ln) {
1aa99e6b
IH
3785 if (c1 != -1000) {
3786 if (do_utf8)
872c91ae
JH
3787 c = utf8n_to_uvchr((U8*)PL_reginput,
3788 UTF8_MAXLEN, 0,
3789 ckWARN(WARN_UTF8) ?
3790 0 : UTF8_ALLOW_ANY);
1aa99e6b 3791 else
9041c2e3 3792 c = UCHARAT(PL_reginput);
1aa99e6b 3793 }
c277df42 3794 /* If it could work, try it. */
eb160463 3795 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3796 {
29d1e993 3797 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3798 REGCP_UNWIND(lastcp);
c277df42
IZ
3799 }
3800 /* Couldn't or didn't -- back up. */
3801 n--;
dfe13c55 3802 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3803 }
a0d0e21e
LW
3804 }
3805 }
4633a7c4 3806 sayNO;
c277df42 3807 break;
a0d0e21e 3808 case END:
0f5d15d6
IZ
3809 if (PL_reg_call_cc) {
3810 re_cc_state *cur_call_cc = PL_reg_call_cc;
3811 CURCUR *cctmp = PL_regcc;
3812 regexp *re = PL_reg_re;
3813 CHECKPOINT cp, lastcp;
3814
3815 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3816 REGCP_SET(lastcp);
0f5d15d6
IZ
3817 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3818 the caller. */
3819 PL_reginput = locinput; /* Make position available to
3820 the callcc. */
3821 cache_re(PL_reg_call_cc->re);
3822 PL_regcc = PL_reg_call_cc->cc;
3823 PL_reg_call_cc = PL_reg_call_cc->prev;
3824 if (regmatch(cur_call_cc->node)) {
3825 PL_reg_call_cc = cur_call_cc;
3826 regcpblow(cp);
3827 sayYES;
3828 }
02db2b7b 3829 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3830 regcppop();
3831 PL_reg_call_cc = cur_call_cc;
3832 PL_regcc = cctmp;
3833 PL_reg_re = re;
3834 cache_re(re);
3835
3836 DEBUG_r(
3837 PerlIO_printf(Perl_debug_log,
3838 "%*s continuation failed...\n",
3839 REPORT_CODE_OFF+PL_regindent*2, "")
3840 );
7821416a 3841 sayNO_SILENT;
0f5d15d6 3842 }
7821416a
IZ
3843 if (locinput < PL_regtill) {
3844 DEBUG_r(PerlIO_printf(Perl_debug_log,
3845 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3846 PL_colors[4],
3847 (long)(locinput - PL_reg_starttry),
3848 (long)(PL_regtill - PL_reg_starttry),
3849 PL_colors[5]));
3850 sayNO_FINAL; /* Cannot match: too short. */
3851 }
3852 PL_reginput = locinput; /* put where regtry can find it */
3853 sayYES_FINAL; /* Success! */
7e5428c5 3854 case SUCCEED:
3280af22 3855 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3856 sayYES_LOUD; /* Success! */
c277df42
IZ
3857 case SUSPEND:
3858 n = 1;
9fe1d20c 3859 PL_reginput = locinput;
9041c2e3 3860 goto do_ifmatch;
a0d0e21e 3861 case UNLESSM:
c277df42 3862 n = 0;
a0ed51b3 3863 if (scan->flags) {
efb30f32
HS
3864 s = HOPBACKc(locinput, scan->flags);
3865 if (!s)
3866 goto say_yes;
3867 PL_reginput = s;
a0ed51b3
LW
3868 }
3869 else
3870 PL_reginput = locinput;
c2b0868c 3871 PL_reg_flags ^= RF_false;
c277df42
IZ
3872 goto do_ifmatch;
3873 case IFMATCH:
3874 n = 1;
a0ed51b3 3875 if (scan->flags) {
efb30f32
HS
3876 s = HOPBACKc(locinput, scan->flags);
3877 if (!s)
3878 goto say_no;
3879 PL_reginput = s;
a0ed51b3
LW
3880 }
3881 else
3882 PL_reginput = locinput;
3883
c277df42 3884 do_ifmatch:
c277df42
IZ
3885 inner = NEXTOPER(NEXTOPER(scan));
3886 if (regmatch(inner) != n) {
c2b0868c
HS
3887 if (n == 0)
3888 PL_reg_flags ^= RF_false;
c277df42
IZ
3889 say_no:
3890 if (logical) {
3891 logical = 0;
3892 sw = 0;
3893 goto do_longjump;
a0ed51b3
LW
3894 }
3895 else
c277df42
IZ
3896 sayNO;
3897 }
c2b0868c
HS
3898 if (n == 0)
3899 PL_reg_flags ^= RF_false;
c277df42
IZ
3900 say_yes:
3901 if (logical) {
3902 logical = 0;
3903 sw = 1;
3904 }
fe44a5e8 3905 if (OP(scan) == SUSPEND) {
3280af22 3906 locinput = PL_reginput;
565764a8 3907 nextchr = UCHARAT(locinput);
fe44a5e8 3908 }
c277df42
IZ
3909 /* FALL THROUGH. */
3910 case LONGJMP:
3911 do_longjump:
3912 next = scan + ARG(scan);
3913 if (next == scan)
3914 next = NULL;
a0d0e21e
LW
3915 break;
3916 default:
b900a521 3917 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3918 PTR2UV(scan), OP(scan));
cea2e8a9 3919 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3920 }
02db2b7b 3921 reenter:
a0d0e21e
LW
3922 scan = next;
3923 }
a687059c 3924
a0d0e21e
LW
3925 /*
3926 * We get here only if there's trouble -- normally "case END" is
3927 * the terminating point.
3928 */
cea2e8a9 3929 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3930 /*NOTREACHED*/
4633a7c4
LW
3931 sayNO;
3932
7821416a
IZ
3933yes_loud:
3934 DEBUG_r(
3935 PerlIO_printf(Perl_debug_log,
3936 "%*s %scould match...%s\n",
3937 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3938 );
3939 goto yes;
3940yes_final:
3941 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3942 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3943yes:
3944#ifdef DEBUGGING
3280af22 3945 PL_regindent--;
4633a7c4 3946#endif
02db2b7b
IZ
3947
3948#if 0 /* Breaks $^R */
3949 if (unwind)
3950 regcpblow(firstcp);
3951#endif
4633a7c4
LW
3952 return 1;
3953
3954no:
7821416a
IZ
3955 DEBUG_r(
3956 PerlIO_printf(Perl_debug_log,
3957 "%*s %sfailed...%s\n",
3958 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3959 );
3960 goto do_no;
3961no_final:
3962do_no:
02db2b7b
IZ
3963 if (unwind) {
3964 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3965
3966 switch (uw->type) {
3967 case RE_UNWIND_BRANCH:
3968 case RE_UNWIND_BRANCHJ:
3969 {
3970 re_unwind_branch_t *uwb = &(uw->branch);
3971 I32 lastparen = uwb->lastparen;
9041c2e3 3972
02db2b7b
IZ
3973 REGCP_UNWIND(uwb->lastcp);
3974 for (n = *PL_reglastparen; n > lastparen; n--)
3975 PL_regendp[n] = -1;
3976 *PL_reglastparen = n;
3977 scan = next = uwb->next;
9041c2e3
NIS
3978 if ( !scan ||
3979 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3980 ? BRANCH : BRANCHJ) ) { /* Failure */
3981 unwind = uwb->prev;
3982#ifdef DEBUGGING
3983 PL_regindent--;
3984#endif
3985 goto do_no;
3986 }
3987 /* Have more choice yet. Reuse the same uwb. */
3988 /*SUPPRESS 560*/
3989 if ((n = (uwb->type == RE_UNWIND_BRANCH
3990 ? NEXT_OFF(next) : ARG(next))))
3991 next += n;
3992 else
3993 next = NULL; /* XXXX Needn't unwinding in this case... */
3994 uwb->next = next;
3995 next = NEXTOPER(scan);
3996 if (uwb->type == RE_UNWIND_BRANCHJ)
3997 next = NEXTOPER(next);
3998 locinput = uwb->locinput;
3999 nextchr = uwb->nextchr;
4000#ifdef DEBUGGING
4001 PL_regindent = uwb->regindent;
4002#endif
4003
4004 goto reenter;
4005 }
4006 /* NOT REACHED */
4007 default:
4008 Perl_croak(aTHX_ "regexp unwind memory corruption");
4009 }
4010 /* NOT REACHED */
4011 }
4633a7c4 4012#ifdef DEBUGGING
3280af22 4013 PL_regindent--;
4633a7c4 4014#endif
a0d0e21e 4015 return 0;
a687059c
LW
4016}
4017
4018/*
4019 - regrepeat - repeatedly match something simple, report how many
4020 */
4021/*
4022 * [This routine now assumes that it will only match on things of length 1.
4023 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4024 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4025 */
76e3520e 4026STATIC I32
cea2e8a9 4027S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 4028{
a0d0e21e 4029 register char *scan;
a0d0e21e 4030 register I32 c;
3280af22 4031 register char *loceol = PL_regeol;
a0ed51b3 4032 register I32 hardcount = 0;
53c4c00c 4033 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4034
3280af22 4035 scan = PL_reginput;
faf11cac
HS
4036 if (max == REG_INFTY)
4037 max = I32_MAX;
4038 else if (max < loceol - scan)
a0d0e21e 4039 loceol = scan + max;
a0d0e21e 4040 switch (OP(p)) {
22c35a8c 4041 case REG_ANY:
1aa99e6b 4042 if (do_utf8) {
ffc61ed2 4043 loceol = PL_regeol;
1aa99e6b 4044 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4045 scan += UTF8SKIP(scan);
4046 hardcount++;
4047 }
4048 } else {
4049 while (scan < loceol && *scan != '\n')
4050 scan++;
a0ed51b3
LW
4051 }
4052 break;
ffc61ed2 4053 case SANY:
def8e4ea
JH
4054 if (do_utf8) {
4055 loceol = PL_regeol;
a0804c9e 4056 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4057 scan += UTF8SKIP(scan);
4058 hardcount++;
4059 }
4060 }
4061 else
4062 scan = loceol;
a0ed51b3 4063 break;
f33976b4
DB
4064 case CANY:
4065 scan = loceol;
4066 break;
090f7165
JH
4067 case EXACT: /* length of string is 1 */
4068 c = (U8)*STRING(p);
4069 while (scan < loceol && UCHARAT(scan) == c)
4070 scan++;
bbce6d69 4071 break;
4072 case EXACTF: /* length of string is 1 */
cd439c50 4073 c = (U8)*STRING(p);
bbce6d69 4074 while (scan < loceol &&
22c35a8c 4075 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4076 scan++;
4077 break;
4078 case EXACTFL: /* length of string is 1 */
3280af22 4079 PL_reg_flags |= RF_tainted;
cd439c50 4080 c = (U8)*STRING(p);
bbce6d69 4081 while (scan < loceol &&
22c35a8c 4082 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4083 scan++;
4084 break;
4085 case ANYOF:
ffc61ed2
JH
4086 if (do_utf8) {
4087 loceol = PL_regeol;
cfc92286
JH
4088 while (hardcount < max && scan < loceol &&
4089 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4090 scan += UTF8SKIP(scan);
4091 hardcount++;
4092 }
4093 } else {
7d3e948e 4094 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4095 scan++;
4096 }
a0d0e21e
LW
4097 break;
4098 case ALNUM:
1aa99e6b 4099 if (do_utf8) {
ffc61ed2 4100 loceol = PL_regeol;
8269fa76 4101 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4102 while (hardcount < max && scan < loceol &&
3568d838 4103 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4104 scan += UTF8SKIP(scan);
4105 hardcount++;
4106 }
4107 } else {
4108 while (scan < loceol && isALNUM(*scan))
4109 scan++;
a0ed51b3
LW
4110 }
4111 break;
bbce6d69 4112 case ALNUML:
3280af22 4113 PL_reg_flags |= RF_tainted;
1aa99e6b 4114 if (do_utf8) {
ffc61ed2 4115 loceol = PL_regeol;
1aa99e6b
IH
4116 while (hardcount < max && scan < loceol &&
4117 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4118 scan += UTF8SKIP(scan);
4119 hardcount++;
4120 }
4121 } else {
4122 while (scan < loceol && isALNUM_LC(*scan))
4123 scan++;
a0ed51b3
LW
4124 }
4125 break;
a0d0e21e 4126 case NALNUM:
1aa99e6b 4127 if (do_utf8) {
ffc61ed2 4128 loceol = PL_regeol;
8269fa76 4129 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4130 while (hardcount < max && scan < loceol &&
3568d838 4131 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4132 scan += UTF8SKIP(scan);
4133 hardcount++;
4134 }
4135 } else {
4136 while (scan < loceol && !isALNUM(*scan))
4137 scan++;
a0ed51b3
LW
4138 }
4139 break;
bbce6d69 4140 case NALNUML:
3280af22 4141 PL_reg_flags |= RF_tainted;
1aa99e6b 4142 if (do_utf8) {
ffc61ed2 4143 loceol = PL_regeol;
1aa99e6b
IH
4144 while (hardcount < max && scan < loceol &&
4145 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4146 scan += UTF8SKIP(scan);
4147 hardcount++;
4148 }
4149 } else {
4150 while (scan < loceol && !isALNUM_LC(*scan))
4151 scan++;
a0ed51b3
LW
4152 }
4153 break;
a0d0e21e 4154 case SPACE:
1aa99e6b 4155 if (do_utf8) {
ffc61ed2 4156 loceol = PL_regeol;
8269fa76 4157 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4158 while (hardcount < max && scan < loceol &&
3568d838
JH
4159 (*scan == ' ' ||
4160 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4161 scan += UTF8SKIP(scan);
4162 hardcount++;
4163 }
4164 } else {
4165 while (scan < loceol && isSPACE(*scan))
4166 scan++;
a0ed51b3
LW
4167 }
4168 break;
bbce6d69 4169 case SPACEL:
3280af22 4170 PL_reg_flags |= RF_tainted;
1aa99e6b 4171 if (do_utf8) {
ffc61ed2 4172 loceol = PL_regeol;
1aa99e6b 4173 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4174 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4175 scan += UTF8SKIP(scan);
4176 hardcount++;
4177 }
4178 } else {
4179 while (scan < loceol && isSPACE_LC(*scan))
4180 scan++;
a0ed51b3
LW
4181 }
4182 break;
a0d0e21e 4183 case NSPACE:
1aa99e6b 4184 if (do_utf8) {
ffc61ed2 4185 loceol = PL_regeol;
8269fa76 4186 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4187 while (hardcount < max && scan < loceol &&
3568d838
JH
4188 !(*scan == ' ' ||
4189 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4190 scan += UTF8SKIP(scan);
4191 hardcount++;
4192 }
4193 } else {
4194 while (scan < loceol && !isSPACE(*scan))
4195 scan++;
4196 break;
a0ed51b3 4197 }
bbce6d69 4198 case NSPACEL:
3280af22 4199 PL_reg_flags |= RF_tainted;
1aa99e6b 4200 if (do_utf8) {
ffc61ed2 4201 loceol = PL_regeol;
1aa99e6b 4202 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4203 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4204 scan += UTF8SKIP(scan);
4205 hardcount++;
4206 }
4207 } else {
4208 while (scan < loceol && !isSPACE_LC(*scan))
4209 scan++;
a0ed51b3
LW
4210 }
4211 break;
a0d0e21e 4212 case DIGIT:
1aa99e6b 4213 if (do_utf8) {
ffc61ed2 4214 loceol = PL_regeol;
8269fa76 4215 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4216 while (hardcount < max && scan < loceol &&
3568d838 4217 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4218 scan += UTF8SKIP(scan);
4219 hardcount++;
4220 }
4221 } else {
4222 while (scan < loceol && isDIGIT(*scan))
4223 scan++;
a0ed51b3
LW
4224 }
4225 break;
a0d0e21e 4226 case NDIGIT:
1aa99e6b 4227 if (do_utf8) {
ffc61ed2 4228 loceol = PL_regeol;
8269fa76 4229 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4230 while (hardcount < max && scan < loceol &&
3568d838 4231 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4232 scan += UTF8SKIP(scan);
4233 hardcount++;
4234 }
4235 } else {
4236 while (scan < loceol && !isDIGIT(*scan))
4237 scan++;
a0ed51b3
LW
4238 }
4239 break;
a0d0e21e
LW
4240 default: /* Called on something of 0 width. */
4241 break; /* So match right here or not at all. */
4242 }
a687059c 4243
a0ed51b3
LW
4244 if (hardcount)
4245 c = hardcount;
4246 else
4247 c = scan - PL_reginput;
3280af22 4248 PL_reginput = scan;
a687059c 4249
9041c2e3 4250 DEBUG_r(
c277df42
IZ
4251 {
4252 SV *prop = sv_newmortal();
4253
4254 regprop(prop, p);
9041c2e3
NIS
4255 PerlIO_printf(Perl_debug_log,
4256 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4257 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4258 });
9041c2e3 4259
a0d0e21e 4260 return(c);
a687059c
LW
4261}
4262
4263/*
c277df42 4264 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4265 *
c277df42
IZ
4266 * The repeater is supposed to have constant length.
4267 */
4268
76e3520e 4269STATIC I32
cea2e8a9 4270S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4271{
b7953727 4272 register char *scan = Nullch;
c277df42 4273 register char *start;
3280af22 4274 register char *loceol = PL_regeol;
a0ed51b3 4275 I32 l = 0;
708e3b05 4276 I32 count = 0, res = 1;
a0ed51b3
LW
4277
4278 if (!max)
4279 return 0;
c277df42 4280
3280af22 4281 start = PL_reginput;
53c4c00c 4282 if (PL_reg_match_utf8) {
708e3b05 4283 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4284 if (!count++) {
4285 l = 0;
4286 while (start < PL_reginput) {
4287 l++;
4288 start += UTF8SKIP(start);
4289 }
4290 *lp = l;
4291 if (l == 0)
4292 return max;
4293 }
4294 if (count == max)
4295 return count;
4296 }
4297 }
4298 else {
708e3b05 4299 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4300 if (!count++) {
4301 *lp = l = PL_reginput - start;
4302 if (max != REG_INFTY && l*max < loceol - scan)
4303 loceol = scan + l*max;
4304 if (l == 0)
4305 return max;
c277df42
IZ
4306 }
4307 }
4308 }
708e3b05 4309 if (!res)
3280af22 4310 PL_reginput = scan;
9041c2e3 4311
a0ed51b3 4312 return count;
c277df42
IZ
4313}
4314
4315/*
ffc61ed2
JH
4316- regclass_swash - prepare the utf8 swash
4317*/
4318
4319SV *
9e55ce06 4320Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4321{
9e55ce06
JH
4322 SV *sw = NULL;
4323 SV *si = NULL;
4324 SV *alt = NULL;
ffc61ed2
JH
4325
4326 if (PL_regdata && PL_regdata->count) {
4327 U32 n = ARG(node);
4328
4329 if (PL_regdata->what[n] == 's') {
4330 SV *rv = (SV*)PL_regdata->data[n];
4331 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4332 SV **ary = AvARRAY(av);
9e55ce06 4333 SV **a, **b;
9041c2e3 4334
9e55ce06
JH
4335 /* See the end of regcomp.c:S_reglass() for
4336 * documentation of these array elements. */
4337
b11f357e
JH
4338 si = *ary;
4339 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4340 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4341
ffc61ed2
JH
4342 if (a)
4343 sw = *a;
4344 else if (si && doinit) {
4345 sw = swash_init("utf8", "", si, 1, 0);
4346 (void)av_store(av, 1, sw);
4347 }
9e55ce06
JH
4348 if (b)
4349 alt = *b;
ffc61ed2
JH
4350 }
4351 }
4352
9e55ce06
JH
4353 if (listsvp)
4354 *listsvp = si;
4355 if (altsvp)
4356 *altsvp = alt;
ffc61ed2
JH
4357
4358 return sw;
4359}
4360
4361/*
ba7b4546 4362 - reginclass - determine if a character falls into a character class
832705d4
JH
4363
4364 The n is the ANYOF regnode, the p is the target string, lenp
4365 is pointer to the maximum length of how far to go in the p
4366 (if the lenp is zero, UTF8SKIP(p) is used),
4367 do_utf8 tells whether the target string is in UTF-8.
4368
bbce6d69 4369 */
4370
76e3520e 4371STATIC bool
ba7b4546 4372S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4373{
ffc61ed2 4374 char flags = ANYOF_FLAGS(n);
bbce6d69 4375 bool match = FALSE;
cc07378b 4376 UV c = *p;
ae9ddab8 4377 STRLEN len = 0;
9e55ce06 4378 STRLEN plen;
1aa99e6b 4379
ae9ddab8
JH
4380 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4381 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4382 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4383
0f0076b4 4384 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4385 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4386 if (lenp)
4387 *lenp = 0;
ffc61ed2 4388 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4389 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4390 match = TRUE;
bbce6d69 4391 }
3568d838 4392 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4393 match = TRUE;
ffc61ed2 4394 if (!match) {
9e55ce06
JH
4395 AV *av;
4396 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4397
4398 if (sw) {
3568d838 4399 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4400 match = TRUE;
4401 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4402 if (!match && lenp && av) {
4403 I32 i;
4404
4405 for (i = 0; i <= av_len(av); i++) {
4406 SV* sv = *av_fetch(av, i, FALSE);
4407 STRLEN len;
4408 char *s = SvPV(sv, len);
4409
061b10df 4410 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4411 *lenp = len;
4412 match = TRUE;
4413 break;
4414 }
4415 }
4416 }
4417 if (!match) {
4a623e43
JH
4418 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4419 STRLEN tmplen;
4420
9e55ce06
JH
4421 to_utf8_fold(p, tmpbuf, &tmplen);
4422 if (swash_fetch(sw, tmpbuf, do_utf8))
4423 match = TRUE;
4424 }
ffc61ed2
JH
4425 }
4426 }
bbce6d69 4427 }
9e55ce06 4428 if (match && lenp && *lenp == 0)
0f0076b4 4429 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4430 }
1aa99e6b 4431 if (!match && c < 256) {
ffc61ed2
JH
4432 if (ANYOF_BITMAP_TEST(n, c))
4433 match = TRUE;
4434 else if (flags & ANYOF_FOLD) {
eb160463 4435 U8 f;
a0ed51b3 4436
ffc61ed2
JH
4437 if (flags & ANYOF_LOCALE) {
4438 PL_reg_flags |= RF_tainted;
4439 f = PL_fold_locale[c];
4440 }
4441 else
4442 f = PL_fold[c];
4443 if (f != c && ANYOF_BITMAP_TEST(n, f))
4444 match = TRUE;
4445 }
4446
4447 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4448 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4449 if (
4450 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4451 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4452 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4453 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4454 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4455 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4456 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4457 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4458 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4459 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4460 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4461 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4462 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4463 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4472 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4473 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4474 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4475 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4476 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4477 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4478 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4479 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4480 ) /* How's that for a conditional? */
4481 {
4482 match = TRUE;
4483 }
a0ed51b3 4484 }
a0ed51b3
LW
4485 }
4486
a0ed51b3
LW
4487 return (flags & ANYOF_INVERT) ? !match : match;
4488}
161b471a 4489
dfe13c55 4490STATIC U8 *
cea2e8a9 4491S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4492{
1aa99e6b
IH
4493 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4494}
4495
4496STATIC U8 *
4497S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4498{
a0ed51b3 4499 if (off >= 0) {
1aa99e6b 4500 while (off-- && s < lim) {
ffc61ed2 4501 /* XXX could check well-formedness here */
a0ed51b3 4502 s += UTF8SKIP(s);
ffc61ed2 4503 }
a0ed51b3
LW
4504 }
4505 else {
4506 while (off++) {
1aa99e6b 4507 if (s > lim) {
a0ed51b3 4508 s--;
ffc61ed2 4509 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4510 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4511 s--;
ffc61ed2
JH
4512 }
4513 /* XXX could check well-formedness here */
a0ed51b3
LW
4514 }
4515 }
4516 }
4517 return s;
4518}
161b471a 4519
dfe13c55 4520STATIC U8 *
1aa99e6b 4521S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4522{
1aa99e6b
IH
4523 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4524}
4525
4526STATIC U8 *
4527S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4528{
4529 if (off >= 0) {
1aa99e6b 4530 while (off-- && s < lim) {
ffc61ed2 4531 /* XXX could check well-formedness here */
a0ed51b3 4532 s += UTF8SKIP(s);
ffc61ed2 4533 }
a0ed51b3
LW
4534 if (off >= 0)
4535 return 0;
4536 }
4537 else {
4538 while (off++) {
1aa99e6b 4539 if (s > lim) {
a0ed51b3 4540 s--;
ffc61ed2 4541 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4542 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4543 s--;
ffc61ed2
JH
4544 }
4545 /* XXX could check well-formedness here */
a0ed51b3
LW
4546 }
4547 else
4548 break;
4549 }
4550 if (off <= 0)
4551 return 0;
4552 }
4553 return s;
4554}
51371543 4555
51371543 4556static void
acfe0abc 4557restore_pos(pTHX_ void *arg)
51371543 4558{
51371543
GS
4559 if (PL_reg_eval_set) {
4560 if (PL_reg_oldsaved) {
4561 PL_reg_re->subbeg = PL_reg_oldsaved;
4562 PL_reg_re->sublen = PL_reg_oldsavedlen;
ed252734
NC
4563#ifdef PERL_COPY_ON_WRITE
4564 PL_reg_re->saved_copy = PL_nrs;
4565#endif
51371543
GS
4566 RX_MATCH_COPIED_on(PL_reg_re);
4567 }
4568 PL_reg_magic->mg_len = PL_reg_oldpos;
4569 PL_reg_eval_set = 0;
4570 PL_curpm = PL_reg_oldcurpm;
4571 }
4572}
33b8afdf
JH
4573
4574STATIC void
4575S_to_utf8_substr(pTHX_ register regexp *prog)
4576{
4577 SV* sv;
4578 if (prog->float_substr && !prog->float_utf8) {
4579 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4580 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4581 sv_utf8_upgrade(sv);
4582 if (SvTAIL(prog->float_substr))
4583 SvTAIL_on(sv);
4584 if (prog->float_substr == prog->check_substr)
4585 prog->check_utf8 = sv;
4586 }
4587 if (prog->anchored_substr && !prog->anchored_utf8) {
4588 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4589 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4590 sv_utf8_upgrade(sv);
4591 if (SvTAIL(prog->anchored_substr))
4592 SvTAIL_on(sv);
4593 if (prog->anchored_substr == prog->check_substr)
4594 prog->check_utf8 = sv;
4595 }
4596}
4597
4598STATIC void
4599S_to_byte_substr(pTHX_ register regexp *prog)
4600{
4601 SV* sv;
4602 if (prog->float_utf8 && !prog->float_substr) {
4603 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4604 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4605 if (sv_utf8_downgrade(sv, TRUE)) {
4606 if (SvTAIL(prog->float_utf8))
4607 SvTAIL_on(sv);
4608 } else {
4609 SvREFCNT_dec(sv);
4610 prog->float_substr = sv = &PL_sv_undef;
4611 }
4612 if (prog->float_utf8 == prog->check_utf8)
4613 prog->check_substr = sv;
4614 }
4615 if (prog->anchored_utf8 && !prog->anchored_substr) {
4616 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4617 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4618 if (sv_utf8_downgrade(sv, TRUE)) {
4619 if (SvTAIL(prog->anchored_utf8))
4620 SvTAIL_on(sv);
4621 } else {
4622 SvREFCNT_dec(sv);
4623 prog->anchored_substr = sv = &PL_sv_undef;
4624 }
4625 if (prog->anchored_utf8 == prog->check_utf8)
4626 prog->check_substr = sv;
4627 }
4628}