This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
UNICODE regexp bug
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
4bb101f2
JH
70 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
71 **** 2000, 2001, 2002, 2003, by Larry Wall and others
a687059c 72 ****
9ef589d8
LW
73 **** You may distribute under the terms of either the GNU General Public
74 **** License or the Artistic License, as specified in the README file.
a687059c
LW
75 *
76 * Beware that some of this code is subtly aware of the way operator
77 * precedence is structured in regular expressions. Serious changes in
78 * regular-expression syntax might require a total rethink.
79 */
80#include "EXTERN.h"
864dbfa3 81#define PERL_IN_REGEXEC_C
a687059c 82#include "perl.h"
0f5d15d6 83
a687059c
LW
84#include "regcomp.h"
85
c277df42
IZ
86#define RF_tainted 1 /* tainted information used? */
87#define RF_warned 2 /* warned about big count? */
ce862d02 88#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3 89#define RF_utf8 8 /* String contains multibyte chars? */
c2b0868c 90#define RF_false 16 /* odd number of nested negatives */
a0ed51b3 91
eb160463 92#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
93
94#define RS_init 1 /* eval environment created */
95#define RS_set 2 /* replsv value is set */
c277df42 96
a687059c
LW
97#ifndef STATIC
98#define STATIC static
99#endif
100
ba7b4546 101#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 102
c277df42
IZ
103/*
104 * Forwards.
105 */
106
33b8afdf 107#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 108#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 109
dfe13c55
GS
110#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
111#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
112#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
113#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
114#define HOPc(pos,off) ((char*)HOP(pos,off))
115#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 116
efb30f32 117#define HOPBACK(pos, off) ( \
e54858b0 118 (PL_reg_match_utf8) \
efb30f32
HS
119 ? reghopmaybe((U8*)pos, -off) \
120 : (pos - off >= PL_bostr) \
121 ? (U8*)(pos - off) \
122 : (U8*)NULL \
123)
124#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
125
1aa99e6b
IH
126#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
127#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
128#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
129#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
130#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
131#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
132
977e4b84 133#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 134
5f80c4cf 135/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
136#define JUMPABLE(rn) ( \
137 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
138 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
139 OP(rn) == PLUS || OP(rn) == MINMOD || \
140 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
141)
142
cca55fe3
JP
143#define HAS_TEXT(rn) ( \
144 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
145)
e2d8ce26 146
a84d97b6
HS
147/*
148 Search for mandatory following text node; for lookahead, the text must
149 follow but for lookbehind (rn->flags != 0) we skip to the next step.
150*/
cca55fe3 151#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 152 while (JUMPABLE(rn)) \
a84d97b6 153 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 154 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
155 else if (OP(rn) == PLUS) \
156 rn = NEXTOPER(rn); \
a84d97b6
HS
157 else if (OP(rn) == IFMATCH) \
158 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 159 else rn += NEXT_OFF(rn); \
5f80c4cf 160} STMT_END
74750237 161
acfe0abc 162static void restore_pos(pTHX_ void *arg);
51371543 163
76e3520e 164STATIC CHECKPOINT
cea2e8a9 165S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 166{
3280af22 167 int retval = PL_savestack_ix;
b1ce53c5
JH
168#define REGCP_PAREN_ELEMS 4
169 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
170 int p;
171
e49a9654
IH
172 if (paren_elems_to_push < 0)
173 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
174
a01268b5 175#define REGCP_OTHER_ELEMS 6
4b3c1a47 176 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 177 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 178/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
179 SSPUSHINT(PL_regendp[p]);
180 SSPUSHINT(PL_regstartp[p]);
3280af22 181 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
182 SSPUSHINT(p);
183 }
b1ce53c5 184/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
185 SSPUSHINT(PL_regsize);
186 SSPUSHINT(*PL_reglastparen);
a01268b5 187 SSPUSHINT(*PL_reglastcloseparen);
3280af22 188 SSPUSHPTR(PL_reginput);
41123dfd
JH
189#define REGCP_FRAME_ELEMS 2
190/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
191 * are needed for the regexp context stack bookkeeping. */
192 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 193 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 194
a0d0e21e
LW
195 return retval;
196}
197
c277df42 198/* These are needed since we do not localize EVAL nodes: */
02db2b7b 199# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 200 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 201 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 202
02db2b7b 203# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 204 PerlIO_printf(Perl_debug_log, \
faccc32b 205 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 206 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 207
76e3520e 208STATIC char *
cea2e8a9 209S_regcppop(pTHX)
a0d0e21e 210{
b1ce53c5 211 I32 i;
a0d0e21e
LW
212 U32 paren = 0;
213 char *input;
cf93c79d 214 I32 tmps;
b1ce53c5
JH
215
216 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 217 i = SSPOPINT;
b1ce53c5
JH
218 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
219 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 220 input = (char *) SSPOPPTR;
a01268b5 221 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
222 *PL_reglastparen = SSPOPINT;
223 PL_regsize = SSPOPINT;
b1ce53c5
JH
224
225 /* Now restore the parentheses context. */
41123dfd
JH
226 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
227 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 228 paren = (U32)SSPOPINT;
3280af22 229 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
230 PL_regstartp[paren] = SSPOPINT;
231 tmps = SSPOPINT;
3280af22
NIS
232 if (paren <= *PL_reglastparen)
233 PL_regendp[paren] = tmps;
c277df42 234 DEBUG_r(
c3464db5 235 PerlIO_printf(Perl_debug_log,
b900a521 236 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 237 (UV)paren, (IV)PL_regstartp[paren],
b900a521 238 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 239 (IV)PL_regendp[paren],
3280af22 240 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 241 );
a0d0e21e 242 }
c277df42 243 DEBUG_r(
eb160463 244 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 245 PerlIO_printf(Perl_debug_log,
faccc32b
JH
246 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
247 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
248 }
249 );
daf18116 250#if 1
dafc8851
JH
251 /* It would seem that the similar code in regtry()
252 * already takes care of this, and in fact it is in
253 * a better location to since this code can #if 0-ed out
254 * but the code in regtry() is needed or otherwise tests
255 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
256 * (as of patchlevel 7877) will fail. Then again,
257 * this code seems to be necessary or otherwise
258 * building DynaLoader will fail:
259 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
260 * --jhi */
eb160463
GS
261 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
262 if ((I32)paren > PL_regsize)
cf93c79d
IZ
263 PL_regstartp[paren] = -1;
264 PL_regendp[paren] = -1;
a0d0e21e 265 }
dafc8851 266#endif
a0d0e21e
LW
267 return input;
268}
269
0f5d15d6 270STATIC char *
cea2e8a9 271S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
272{
273 I32 tmp = PL_savestack_ix;
274
275 PL_savestack_ix = ss;
276 regcppop();
277 PL_savestack_ix = tmp;
942e002e 278 return Nullch;
0f5d15d6
IZ
279}
280
281typedef struct re_cc_state
282{
283 I32 ss;
284 regnode *node;
285 struct re_cc_state *prev;
286 CURCUR *cc;
287 regexp *re;
288} re_cc_state;
289
02db2b7b 290#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 291
29d1e993
HS
292#define TRYPAREN(paren, n, input) { \
293 if (paren) { \
294 if (n) { \
295 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
296 PL_regendp[paren] = input - PL_bostr; \
297 } \
298 else \
299 PL_regendp[paren] = -1; \
300 } \
301 if (regmatch(next)) \
302 sayYES; \
303 if (paren && n) \
304 PL_regendp[paren] = -1; \
305}
306
307
a687059c 308/*
e50aee73 309 * pregexec and friends
a687059c
LW
310 */
311
312/*
c277df42 313 - pregexec - match a regexp against a string
a687059c 314 */
c277df42 315I32
864dbfa3 316Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 317 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
318/* strend: pointer to null at end of string */
319/* strbeg: real beginning of string */
320/* minend: end of match must be >=minend after stringarg. */
321/* nosave: For optimizations. */
322{
323 return
9041c2e3 324 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
325 nosave ? 0 : REXEC_COPY_STR);
326}
0f5d15d6
IZ
327
328STATIC void
cea2e8a9 329S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
330{
331 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
332#ifdef DEBUGGING
333 PL_regprogram = prog->program;
334#endif
335 PL_regnpar = prog->nparens;
9041c2e3
NIS
336 PL_regdata = prog->data;
337 PL_reg_re = prog;
0f5d15d6 338}
22e551b9 339
9041c2e3 340/*
cad2e5aa
JH
341 * Need to implement the following flags for reg_anch:
342 *
343 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
344 * USE_INTUIT_ML
345 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
346 * INTUIT_AUTORITATIVE_ML
347 * INTUIT_ONCE_NOML - Intuit can match in one location only.
348 * INTUIT_ONCE_ML
349 *
350 * Another flag for this function: SECOND_TIME (so that float substrs
351 * with giant delta may be not rechecked).
352 */
353
354/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
355
2c2d71f5 356/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
357 Otherwise, only SvCUR(sv) is used to get strbeg. */
358
359/* XXXX We assume that strpos is strbeg unless sv. */
360
6eb5f6b9
JH
361/* XXXX Some places assume that there is a fixed substring.
362 An update may be needed if optimizer marks as "INTUITable"
363 RExen without fixed substrings. Similarly, it is assumed that
364 lengths of all the strings are no more than minlen, thus they
365 cannot come from lookahead.
366 (Or minlen should take into account lookahead.) */
367
2c2d71f5
JH
368/* A failure to find a constant substring means that there is no need to make
369 an expensive call to REx engine, thus we celebrate a failure. Similarly,
370 finding a substring too deep into the string means that less calls to
30944b6d
IZ
371 regtry() should be needed.
372
373 REx compiler's optimizer found 4 possible hints:
374 a) Anchored substring;
375 b) Fixed substring;
376 c) Whether we are anchored (beginning-of-line or \G);
377 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 378 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
379 string which does not contradict any of them.
380 */
2c2d71f5 381
6eb5f6b9
JH
382/* Most of decisions we do here should have been done at compile time.
383 The nodes of the REx which we used for the search should have been
384 deleted from the finite automaton. */
385
cad2e5aa
JH
386char *
387Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
388 char *strend, U32 flags, re_scream_pos_data *data)
389{
b7953727 390 register I32 start_shift = 0;
cad2e5aa 391 /* Should be nonnegative! */
b7953727 392 register I32 end_shift = 0;
2c2d71f5
JH
393 register char *s;
394 register SV *check;
a1933d95 395 char *strbeg;
cad2e5aa 396 char *t;
33b8afdf 397 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 398 I32 ml_anch;
6eb5f6b9 399 register char *other_last = Nullch; /* other substr checked before this */
b7953727 400 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
401#ifdef DEBUGGING
402 char *i_strpos = strpos;
ce333219 403 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 404#endif
a30b2f1f 405 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 406
b8d68ded
JH
407 if (prog->reganch & ROPT_UTF8) {
408 DEBUG_r(PerlIO_printf(Perl_debug_log,
409 "UTF-8 regex...\n"));
410 PL_reg_flags |= RF_utf8;
411 }
412
2a782b5b 413 DEBUG_r({
b8d68ded 414 char *s = PL_reg_match_utf8 ?
c728cb41
JH
415 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
416 strpos;
b8d68ded
JH
417 int len = PL_reg_match_utf8 ?
418 strlen(s) : strend - strpos;
2a782b5b
JH
419 if (!PL_colorset)
420 reginitcolors();
b8d68ded
JH
421 if (PL_reg_match_utf8)
422 DEBUG_r(PerlIO_printf(Perl_debug_log,
423 "UTF-8 target...\n"));
2a782b5b
JH
424 PerlIO_printf(Perl_debug_log,
425 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
426 PL_colors[4],PL_colors[5],PL_colors[0],
427 prog->precomp,
428 PL_colors[1],
429 (strlen(prog->precomp) > 60 ? "..." : ""),
430 PL_colors[0],
431 (int)(len > 60 ? 60 : len),
432 s, PL_colors[1],
433 (len > 60 ? "..." : "")
434 );
435 });
cad2e5aa 436
c344f387
JH
437 /* CHR_DIST() would be more correct here but it makes things slow. */
438 if (prog->minlen > strend - strpos) {
a72c7584
JH
439 DEBUG_r(PerlIO_printf(Perl_debug_log,
440 "String too short... [re_intuit_start]\n"));
cad2e5aa 441 goto fail;
2c2d71f5 442 }
a1933d95 443 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 444 PL_regeol = strend;
33b8afdf
JH
445 if (do_utf8) {
446 if (!prog->check_utf8 && prog->check_substr)
447 to_utf8_substr(prog);
448 check = prog->check_utf8;
449 } else {
450 if (!prog->check_substr && prog->check_utf8)
451 to_byte_substr(prog);
452 check = prog->check_substr;
453 }
454 if (check == &PL_sv_undef) {
455 DEBUG_r(PerlIO_printf(Perl_debug_log,
456 "Non-utf string cannot match utf check string\n"));
457 goto fail;
458 }
2c2d71f5 459 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
460 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
461 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 462 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 463
7e25d62c
JH
464 if (!ml_anch) {
465 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
466 | ROPT_IMPLICIT)) /* not a real BOL */
467 /* SvCUR is not set on references: SvRV and SvPVX overlap */
468 && sv && !SvROK(sv)
469 && (strpos != strbeg)) {
470 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
471 goto fail;
472 }
473 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 474 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 475 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
476 I32 slen;
477
1aa99e6b 478 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
479 if (SvTAIL(check)) {
480 slen = SvCUR(check); /* >= 1 */
cad2e5aa 481
9041c2e3 482 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
483 || (strend - s == slen && strend[-1] != '\n')) {
484 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
485 goto fail_finish;
cad2e5aa
JH
486 }
487 /* Now should match s[0..slen-2] */
488 slen--;
653099ff 489 if (slen && (*SvPVX(check) != *s
cad2e5aa 490 || (slen > 1
653099ff 491 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
492 report_neq:
493 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
494 goto fail_finish;
495 }
cad2e5aa 496 }
653099ff
GS
497 else if (*SvPVX(check) != *s
498 || ((slen = SvCUR(check)) > 1
499 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
500 goto report_neq;
501 goto success_at_start;
7e25d62c 502 }
cad2e5aa 503 }
2c2d71f5 504 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 505 s = strpos;
2c2d71f5 506 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 507 end_shift = prog->minlen - start_shift -
653099ff 508 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 509 if (!ml_anch) {
653099ff
GS
510 I32 end = prog->check_offset_max + CHR_SVLEN(check)
511 - (SvTAIL(check) != 0);
1aa99e6b 512 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
513
514 if (end_shift < eshift)
515 end_shift = eshift;
516 }
cad2e5aa 517 }
2c2d71f5 518 else { /* Can match at random position */
cad2e5aa
JH
519 ml_anch = 0;
520 s = strpos;
2c2d71f5
JH
521 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
522 /* Should be nonnegative! */
523 end_shift = prog->minlen - start_shift -
653099ff 524 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
525 }
526
2c2d71f5 527#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 528 if (end_shift < 0)
6bbae5e6 529 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
530#endif
531
2c2d71f5
JH
532 restart:
533 /* Find a possible match in the region s..strend by looking for
534 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 535 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
536 I32 p = -1; /* Internal iterator of scream. */
537 I32 *pp = data ? data->scream_pos : &p;
538
2c2d71f5
JH
539 if (PL_screamfirst[BmRARE(check)] >= 0
540 || ( BmRARE(check) == '\n'
541 && (BmPREVIOUS(check) == SvCUR(check) - 1)
542 && SvTAIL(check) ))
9041c2e3 543 s = screaminstr(sv, check,
2c2d71f5 544 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 545 else
2c2d71f5 546 goto fail_finish;
4addbd3b
HS
547 /* we may be pointing at the wrong string */
548 if (s && RX_MATCH_COPIED(prog))
7ef91622 549 s = strbeg + (s - SvPVX(sv));
cad2e5aa
JH
550 if (data)
551 *data->scream_olds = s;
552 }
f33976b4 553 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
554 s = fbm_instr((U8*)(s + start_shift),
555 (U8*)(strend - end_shift),
556 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 557 else
1aa99e6b
IH
558 s = fbm_instr(HOP3(s, start_shift, strend),
559 HOP3(strend, -end_shift, strbeg),
2c2d71f5 560 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
561
562 /* Update the count-of-usability, remove useless subpatterns,
563 unshift s. */
2c2d71f5
JH
564
565 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
566 (s ? "Found" : "Did not find"),
33b8afdf 567 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 568 PL_colors[0],
7b0972df
JH
569 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
570 SvPVX(check),
2c2d71f5
JH
571 PL_colors[1], (SvTAIL(check) ? "$" : ""),
572 (s ? " at offset " : "...\n") ) );
573
574 if (!s)
575 goto fail_finish;
576
6eb5f6b9
JH
577 check_at = s;
578
2c2d71f5 579 /* Finish the diagnostic message */
30944b6d 580 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
581
582 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
583 Start with the other substr.
584 XXXX no SCREAM optimization yet - and a very coarse implementation
585 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
586 *always* match. Probably should be marked during compile...
587 Probably it is right to do no SCREAM here...
588 */
589
33b8afdf 590 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 591 /* Take into account the "other" substring. */
2c2d71f5
JH
592 /* XXXX May be hopelessly wrong for UTF... */
593 if (!other_last)
6eb5f6b9 594 other_last = strpos;
33b8afdf 595 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
596 do_other_anchored:
597 {
1aa99e6b 598 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 599 char *s1 = s;
33b8afdf 600 SV* must;
2c2d71f5 601
2c2d71f5
JH
602 t = s - prog->check_offset_max;
603 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 604 && (!do_utf8
1aa99e6b 605 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 606 && t > strpos)))
30944b6d 607 /* EMPTY */;
2c2d71f5
JH
608 else
609 t = strpos;
1aa99e6b 610 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
611 if (t < other_last) /* These positions already checked */
612 t = other_last;
1aa99e6b 613 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
614 if (last < last1)
615 last1 = last;
616 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
617 /* On end-of-str: see comment below. */
33b8afdf
JH
618 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
619 if (must == &PL_sv_undef) {
620 s = (char*)NULL;
621 DEBUG_r(must = prog->anchored_utf8); /* for debug */
622 }
623 else
624 s = fbm_instr(
625 (unsigned char*)t,
626 HOP3(HOP3(last1, prog->anchored_offset, strend)
627 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
628 must,
629 PL_multiline ? FBMrf_MULTILINE : 0
630 );
1aa99e6b
IH
631 DEBUG_r(PerlIO_printf(Perl_debug_log,
632 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
633 (s ? "Found" : "Contradicts"),
634 PL_colors[0],
33b8afdf
JH
635 (int)(SvCUR(must)
636 - (SvTAIL(must)!=0)),
637 SvPVX(must),
638 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
639 if (!s) {
640 if (last1 >= last2) {
641 DEBUG_r(PerlIO_printf(Perl_debug_log,
642 ", giving up...\n"));
643 goto fail_finish;
644 }
645 DEBUG_r(PerlIO_printf(Perl_debug_log,
646 ", trying floating at offset %ld...\n",
1aa99e6b
IH
647 (long)(HOP3c(s1, 1, strend) - i_strpos)));
648 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
649 s = HOP3c(last, 1, strend);
2c2d71f5
JH
650 goto restart;
651 }
652 else {
653 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 654 (long)(s - i_strpos)));
1aa99e6b
IH
655 t = HOP3c(s, -prog->anchored_offset, strbeg);
656 other_last = HOP3c(s, 1, strend);
30944b6d 657 s = s1;
2c2d71f5
JH
658 if (t == strpos)
659 goto try_at_start;
2c2d71f5
JH
660 goto try_at_offset;
661 }
30944b6d 662 }
2c2d71f5
JH
663 }
664 else { /* Take into account the floating substring. */
33b8afdf
JH
665 char *last, *last1;
666 char *s1 = s;
667 SV* must;
668
669 t = HOP3c(s, -start_shift, strbeg);
670 last1 = last =
671 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
672 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
673 last = HOP3c(t, prog->float_max_offset, strend);
674 s = HOP3c(t, prog->float_min_offset, strend);
675 if (s < other_last)
676 s = other_last;
2c2d71f5 677 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
678 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
679 /* fbm_instr() takes into account exact value of end-of-str
680 if the check is SvTAIL(ed). Since false positives are OK,
681 and end-of-str is not later than strend we are OK. */
682 if (must == &PL_sv_undef) {
683 s = (char*)NULL;
684 DEBUG_r(must = prog->float_utf8); /* for debug message */
685 }
686 else
2c2d71f5 687 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
688 (unsigned char*)last + SvCUR(must)
689 - (SvTAIL(must)!=0),
690 must, PL_multiline ? FBMrf_MULTILINE : 0);
691 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
692 (s ? "Found" : "Contradicts"),
693 PL_colors[0],
694 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
695 SvPVX(must),
696 PL_colors[1], (SvTAIL(must) ? "$" : "")));
697 if (!s) {
698 if (last1 == last) {
2c2d71f5 699 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
700 ", giving up...\n"));
701 goto fail_finish;
2c2d71f5 702 }
33b8afdf
JH
703 DEBUG_r(PerlIO_printf(Perl_debug_log,
704 ", trying anchored starting at offset %ld...\n",
705 (long)(s1 + 1 - i_strpos)));
706 other_last = last;
707 s = HOP3c(t, 1, strend);
708 goto restart;
709 }
710 else {
711 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
712 (long)(s - i_strpos)));
713 other_last = s; /* Fix this later. --Hugo */
714 s = s1;
715 if (t == strpos)
716 goto try_at_start;
717 goto try_at_offset;
718 }
2c2d71f5 719 }
cad2e5aa 720 }
2c2d71f5
JH
721
722 t = s - prog->check_offset_max;
2c2d71f5 723 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 724 && (!do_utf8
1aa99e6b
IH
725 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
726 && t > strpos))) {
2c2d71f5
JH
727 /* Fixed substring is found far enough so that the match
728 cannot start at strpos. */
729 try_at_offset:
cad2e5aa 730 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
731 /* Eventually fbm_*() should handle this, but often
732 anchored_offset is not 0, so this check will not be wasted. */
733 /* XXXX In the code below we prefer to look for "^" even in
734 presence of anchored substrings. And we search even
735 beyond the found float position. These pessimizations
736 are historical artefacts only. */
737 find_anchor:
2c2d71f5 738 while (t < strend - prog->minlen) {
cad2e5aa 739 if (*t == '\n') {
4ee3650e 740 if (t < check_at - prog->check_offset_min) {
33b8afdf 741 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
742 /* Since we moved from the found position,
743 we definitely contradict the found anchored
30944b6d
IZ
744 substr. Due to the above check we do not
745 contradict "check" substr.
746 Thus we can arrive here only if check substr
747 is float. Redo checking for "other"=="fixed".
748 */
9041c2e3 749 strpos = t + 1;
30944b6d
IZ
750 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
751 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
752 goto do_other_anchored;
753 }
4ee3650e
GS
754 /* We don't contradict the found floating substring. */
755 /* XXXX Why not check for STCLASS? */
cad2e5aa 756 s = t + 1;
2c2d71f5 757 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 758 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
759 goto set_useful;
760 }
4ee3650e
GS
761 /* Position contradicts check-string */
762 /* XXXX probably better to look for check-string
763 than for "\n", so one should lower the limit for t? */
764 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 765 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 766 other_last = strpos = s = t + 1;
cad2e5aa
JH
767 goto restart;
768 }
769 t++;
770 }
2c2d71f5
JH
771 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
772 PL_colors[0],PL_colors[1]));
773 goto fail_finish;
cad2e5aa 774 }
f5952150
GS
775 else {
776 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
777 PL_colors[0],PL_colors[1]));
778 }
cad2e5aa
JH
779 s = t;
780 set_useful:
33b8afdf 781 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
782 }
783 else {
f5952150 784 /* The found string does not prohibit matching at strpos,
2c2d71f5 785 - no optimization of calling REx engine can be performed,
f5952150
GS
786 unless it was an MBOL and we are not after MBOL,
787 or a future STCLASS check will fail this. */
2c2d71f5
JH
788 try_at_start:
789 /* Even in this situation we may use MBOL flag if strpos is offset
790 wrt the start of the string. */
05b4157f 791 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 792 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
793 /* May be due to an implicit anchor of m{.*foo} */
794 && !(prog->reganch & ROPT_IMPLICIT))
795 {
cad2e5aa
JH
796 t = strpos;
797 goto find_anchor;
798 }
30944b6d 799 DEBUG_r( if (ml_anch)
f5952150
GS
800 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
801 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 802 );
2c2d71f5 803 success_at_start:
30944b6d 804 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
805 && (do_utf8 ? (
806 prog->check_utf8 /* Could be deleted already */
807 && --BmUSEFUL(prog->check_utf8) < 0
808 && (prog->check_utf8 == prog->float_utf8)
809 ) : (
810 prog->check_substr /* Could be deleted already */
811 && --BmUSEFUL(prog->check_substr) < 0
812 && (prog->check_substr == prog->float_substr)
813 )))
66e933ab 814 {
cad2e5aa 815 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 816 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
817 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
818 if (do_utf8 ? prog->check_substr : prog->check_utf8)
819 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
820 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
821 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 822 check = Nullsv; /* abort */
cad2e5aa 823 s = strpos;
3cf5c195
IZ
824 /* XXXX This is a remnant of the old implementation. It
825 looks wasteful, since now INTUIT can use many
6eb5f6b9 826 other heuristics. */
cad2e5aa
JH
827 prog->reganch &= ~RE_USE_INTUIT;
828 }
829 else
830 s = strpos;
831 }
832
6eb5f6b9
JH
833 /* Last resort... */
834 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
835 if (prog->regstclass) {
836 /* minlen == 0 is possible if regstclass is \b or \B,
837 and the fixed substr is ''$.
838 Since minlen is already taken into account, s+1 is before strend;
839 accidentally, minlen >= 1 guaranties no false positives at s + 1
840 even for \b or \B. But (minlen? 1 : 0) below assumes that
841 regstclass does not come from lookahead... */
842 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
843 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 844 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 845 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 846 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 847 : 1);
33b8afdf 848 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 849 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 850 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
851 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
852 cl_l, strend)
853 : strend);
a1933d95 854 char *startpos = strbeg;
6eb5f6b9
JH
855
856 t = s;
9926ca43 857 cache_re(prog);
f33976b4 858 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
859 if (!s) {
860#ifdef DEBUGGING
b7953727 861 char *what = 0;
6eb5f6b9
JH
862#endif
863 if (endpos == strend) {
864 DEBUG_r( PerlIO_printf(Perl_debug_log,
865 "Could not match STCLASS...\n") );
866 goto fail;
867 }
66e933ab
GS
868 DEBUG_r( PerlIO_printf(Perl_debug_log,
869 "This position contradicts STCLASS...\n") );
653099ff
GS
870 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
871 goto fail;
6eb5f6b9 872 /* Contradict one of substrings */
33b8afdf
JH
873 if (prog->anchored_substr || prog->anchored_utf8) {
874 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
875 DEBUG_r( what = "anchored" );
876 hop_and_restart:
1aa99e6b 877 s = HOP3c(t, 1, strend);
66e933ab
GS
878 if (s + start_shift + end_shift > strend) {
879 /* XXXX Should be taken into account earlier? */
880 DEBUG_r( PerlIO_printf(Perl_debug_log,
881 "Could not match STCLASS...\n") );
882 goto fail;
883 }
5e39e1e5
HS
884 if (!check)
885 goto giveup;
6eb5f6b9 886 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 887 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
888 what, (long)(s + start_shift - i_strpos)) );
889 goto restart;
890 }
66e933ab 891 /* Have both, check_string is floating */
6eb5f6b9
JH
892 if (t + start_shift >= check_at) /* Contradicts floating=check */
893 goto retry_floating_check;
894 /* Recheck anchored substring, but not floating... */
9041c2e3 895 s = check_at;
5e39e1e5
HS
896 if (!check)
897 goto giveup;
6eb5f6b9 898 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 899 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
900 (long)(other_last - i_strpos)) );
901 goto do_other_anchored;
902 }
60e71179
GS
903 /* Another way we could have checked stclass at the
904 current position only: */
905 if (ml_anch) {
906 s = t = t + 1;
5e39e1e5
HS
907 if (!check)
908 goto giveup;
60e71179 909 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
910 "Looking for /%s^%s/m starting at offset %ld...\n",
911 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 912 goto try_at_offset;
66e933ab 913 }
33b8afdf 914 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 915 goto fail;
6eb5f6b9
JH
916 /* Check is floating subtring. */
917 retry_floating_check:
918 t = check_at - start_shift;
919 DEBUG_r( what = "floating" );
920 goto hop_and_restart;
921 }
b7953727
JH
922 if (t != s) {
923 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 924 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
925 (long)(t - i_strpos), (long)(s - i_strpos))
926 );
927 }
928 else {
929 DEBUG_r(PerlIO_printf(Perl_debug_log,
930 "Does not contradict STCLASS...\n");
931 );
932 }
6eb5f6b9 933 }
5e39e1e5
HS
934 giveup:
935 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
936 PL_colors[4], (check ? "Guessed" : "Giving up"),
937 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 938 return s;
2c2d71f5
JH
939
940 fail_finish: /* Substring not found */
33b8afdf
JH
941 if (prog->check_substr || prog->check_utf8) /* could be removed already */
942 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 943 fail:
2c2d71f5 944 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
945 PL_colors[4],PL_colors[5]));
946 return Nullch;
947}
9661b544 948
6eb5f6b9 949/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
950STATIC char *
951S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 952{
6eb5f6b9
JH
953 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
954 char *m;
d8093b23
G
955 STRLEN ln;
956 unsigned int c1;
957 unsigned int c2;
6eb5f6b9
JH
958 char *e;
959 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 960 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 961
6eb5f6b9
JH
962 /* We know what class it must start with. */
963 switch (OP(c)) {
6eb5f6b9 964 case ANYOF:
388cc4de
HS
965 if (do_utf8) {
966 while (s < strend) {
967 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
968 !UTF8_IS_INVARIANT((U8)s[0]) ?
969 reginclass(c, (U8*)s, 0, do_utf8) :
970 REGINCLASS(c, (U8*)s)) {
971 if (tmp && (norun || regtry(prog, s)))
972 goto got_it;
973 else
974 tmp = doevery;
975 }
976 else
977 tmp = 1;
978 s += UTF8SKIP(s);
979 }
980 }
981 else {
982 while (s < strend) {
983 STRLEN skip = 1;
984
985 if (REGINCLASS(c, (U8*)s) ||
986 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
987 /* The assignment of 2 is intentional:
988 * for the folded sharp s, the skip is 2. */
989 (skip = SHARP_S_SKIP))) {
990 if (tmp && (norun || regtry(prog, s)))
991 goto got_it;
992 else
993 tmp = doevery;
994 }
995 else
996 tmp = 1;
997 s += skip;
998 }
a0d0e21e 999 }
6eb5f6b9 1000 break;
f33976b4
DB
1001 case CANY:
1002 while (s < strend) {
1003 if (tmp && (norun || regtry(prog, s)))
1004 goto got_it;
1005 else
1006 tmp = doevery;
1007 s++;
1008 }
1009 break;
6eb5f6b9
JH
1010 case EXACTF:
1011 m = STRING(c);
1012 ln = STR_LEN(c);
1aa99e6b 1013 if (UTF) {
a2a2844f 1014 STRLEN ulen1, ulen2;
e7ae6809
JH
1015 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1016 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
1017
1018 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1019 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1020
872c91ae
JH
1021 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1022 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1023 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1024 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1025 }
1026 else {
1027 c1 = *(U8*)m;
1028 c2 = PL_fold[c1];
1029 }
6eb5f6b9
JH
1030 goto do_exactf;
1031 case EXACTFL:
1032 m = STRING(c);
1033 ln = STR_LEN(c);
d8093b23 1034 c1 = *(U8*)m;
6eb5f6b9
JH
1035 c2 = PL_fold_locale[c1];
1036 do_exactf:
c1e0e3d2
NIS
1037 /* The last byte to try is ln-1 characters before strend
1038 * since the strend points one byte past the string. */
1039 e = HOP3c(strend, (I32)1 - (I32)ln, s);
b3c9acc1 1040
6eb5f6b9
JH
1041 if (norun && e < s)
1042 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1043
60a8b682
JH
1044 /* The idea in the EXACTF* cases is to first find the
1045 * first character of the EXACTF* node and then, if
1046 * necessary, case-insensitively compare the full
1047 * text of the node. The c1 and c2 are the first
1048 * characters (though in Unicode it gets a bit
1049 * more complicated because there are more cases
7f16dd3d
JH
1050 * than just upper and lower: one needs to use
1051 * the so-called folding case for case-insensitive
1052 * matching (called "loose matching" in Unicode).
1053 * ibcmp_utf8() will do just that. */
60a8b682 1054
1aa99e6b 1055 if (do_utf8) {
575cac57
JH
1056 UV c, f;
1057 U8 tmpbuf [UTF8_MAXLEN+1];
1058 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1059 STRLEN len, foldlen;
d7f013c8 1060
09091399 1061 if (c1 == c2) {
1aa99e6b 1062 while (s <= e) {
872c91ae
JH
1063 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1064 ckWARN(WARN_UTF8) ?
1065 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1066 if ( c == c1
1067 && (ln == len ||
66423254 1068 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1069 m, (char **)0, ln, (bool)UTF))
55da9344 1070 && (norun || regtry(prog, s)) )
1aa99e6b 1071 goto got_it;
80aecb99
JH
1072 else {
1073 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1074 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1075 if ( f != c
1076 && (f == c1 || f == c2)
1077 && (ln == foldlen ||
66423254
JH
1078 !ibcmp_utf8((char *) foldbuf,
1079 (char **)0, foldlen, do_utf8,
d07ddd77 1080 m,
eb160463 1081 (char **)0, ln, (bool)UTF))
80aecb99
JH
1082 && (norun || regtry(prog, s)) )
1083 goto got_it;
1084 }
1aa99e6b
IH
1085 s += len;
1086 }
09091399
JH
1087 }
1088 else {
1aa99e6b 1089 while (s <= e) {
872c91ae
JH
1090 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1091 ckWARN(WARN_UTF8) ?
1092 0 : UTF8_ALLOW_ANY);
80aecb99 1093
60a8b682 1094 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1095 * Note that not all the possible combinations
1096 * are handled here: some of them are handled
1097 * by the standard folding rules, and some of
1098 * them (the character class or ANYOF cases)
1099 * are handled during compiletime in
1100 * regexec.c:S_regclass(). */
880bd946
JH
1101 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1102 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1103 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1104
1105 if ( (c == c1 || c == c2)
1106 && (ln == len ||
66423254 1107 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1108 m, (char **)0, ln, (bool)UTF))
55da9344 1109 && (norun || regtry(prog, s)) )
1aa99e6b 1110 goto got_it;
80aecb99
JH
1111 else {
1112 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1113 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1114 if ( f != c
1115 && (f == c1 || f == c2)
1116 && (ln == foldlen ||
a6872d42 1117 !ibcmp_utf8((char *) foldbuf,
66423254 1118 (char **)0, foldlen, do_utf8,
d07ddd77 1119 m,
eb160463 1120 (char **)0, ln, (bool)UTF))
80aecb99
JH
1121 && (norun || regtry(prog, s)) )
1122 goto got_it;
1123 }
1aa99e6b
IH
1124 s += len;
1125 }
09091399 1126 }
1aa99e6b
IH
1127 }
1128 else {
1129 if (c1 == c2)
1130 while (s <= e) {
1131 if ( *(U8*)s == c1
1132 && (ln == 1 || !(OP(c) == EXACTF
1133 ? ibcmp(s, m, ln)
1134 : ibcmp_locale(s, m, ln)))
1135 && (norun || regtry(prog, s)) )
1136 goto got_it;
1137 s++;
1138 }
1139 else
1140 while (s <= e) {
1141 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1142 && (ln == 1 || !(OP(c) == EXACTF
1143 ? ibcmp(s, m, ln)
1144 : ibcmp_locale(s, m, ln)))
1145 && (norun || regtry(prog, s)) )
1146 goto got_it;
1147 s++;
1148 }
b3c9acc1
IZ
1149 }
1150 break;
bbce6d69 1151 case BOUNDL:
3280af22 1152 PL_reg_flags |= RF_tainted;
bbce6d69 1153 /* FALL THROUGH */
a0d0e21e 1154 case BOUND:
ffc61ed2 1155 if (do_utf8) {
12d33761 1156 if (s == PL_bostr)
ffc61ed2
JH
1157 tmp = '\n';
1158 else {
b4f7163a 1159 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1160
b4f7163a 1161 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1162 }
1163 tmp = ((OP(c) == BOUND ?
9041c2e3 1164 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1165 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1166 while (s < strend) {
1167 if (tmp == !(OP(c) == BOUND ?
3568d838 1168 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1169 isALNUM_LC_utf8((U8*)s)))
1170 {
1171 tmp = !tmp;
1172 if ((norun || regtry(prog, s)))
1173 goto got_it;
1174 }
1175 s += UTF8SKIP(s);
a687059c 1176 }
a0d0e21e 1177 }
667bb95a 1178 else {
12d33761 1179 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1180 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1181 while (s < strend) {
1182 if (tmp ==
1183 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1184 tmp = !tmp;
1185 if ((norun || regtry(prog, s)))
1186 goto got_it;
1187 }
1188 s++;
a0ed51b3 1189 }
a0ed51b3 1190 }
6eb5f6b9 1191 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1192 goto got_it;
1193 break;
bbce6d69 1194 case NBOUNDL:
3280af22 1195 PL_reg_flags |= RF_tainted;
bbce6d69 1196 /* FALL THROUGH */
a0d0e21e 1197 case NBOUND:
ffc61ed2 1198 if (do_utf8) {
12d33761 1199 if (s == PL_bostr)
ffc61ed2
JH
1200 tmp = '\n';
1201 else {
b4f7163a 1202 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1203
b4f7163a 1204 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1205 }
1206 tmp = ((OP(c) == NBOUND ?
9041c2e3 1207 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1208 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1209 while (s < strend) {
1210 if (tmp == !(OP(c) == NBOUND ?
3568d838 1211 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1212 isALNUM_LC_utf8((U8*)s)))
1213 tmp = !tmp;
1214 else if ((norun || regtry(prog, s)))
1215 goto got_it;
1216 s += UTF8SKIP(s);
1217 }
a0d0e21e 1218 }
667bb95a 1219 else {
12d33761 1220 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1221 tmp = ((OP(c) == NBOUND ?
1222 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1223 while (s < strend) {
1224 if (tmp ==
1225 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1226 tmp = !tmp;
1227 else if ((norun || regtry(prog, s)))
1228 goto got_it;
1229 s++;
1230 }
a0ed51b3 1231 }
6eb5f6b9 1232 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1233 goto got_it;
1234 break;
a0d0e21e 1235 case ALNUM:
ffc61ed2 1236 if (do_utf8) {
8269fa76 1237 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1238 while (s < strend) {
3568d838 1239 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1240 if (tmp && (norun || regtry(prog, s)))
1241 goto got_it;
1242 else
1243 tmp = doevery;
1244 }
bbce6d69 1245 else
ffc61ed2
JH
1246 tmp = 1;
1247 s += UTF8SKIP(s);
bbce6d69 1248 }
bbce6d69 1249 }
ffc61ed2
JH
1250 else {
1251 while (s < strend) {
1252 if (isALNUM(*s)) {
1253 if (tmp && (norun || regtry(prog, s)))
1254 goto got_it;
1255 else
1256 tmp = doevery;
1257 }
a0ed51b3 1258 else
ffc61ed2
JH
1259 tmp = 1;
1260 s++;
a0ed51b3 1261 }
a0ed51b3
LW
1262 }
1263 break;
bbce6d69 1264 case ALNUML:
3280af22 1265 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1266 if (do_utf8) {
1267 while (s < strend) {
1268 if (isALNUM_LC_utf8((U8*)s)) {
1269 if (tmp && (norun || regtry(prog, s)))
1270 goto got_it;
1271 else
1272 tmp = doevery;
1273 }
a687059c 1274 else
ffc61ed2
JH
1275 tmp = 1;
1276 s += UTF8SKIP(s);
a0d0e21e 1277 }
a0d0e21e 1278 }
ffc61ed2
JH
1279 else {
1280 while (s < strend) {
1281 if (isALNUM_LC(*s)) {
1282 if (tmp && (norun || regtry(prog, s)))
1283 goto got_it;
1284 else
1285 tmp = doevery;
1286 }
a0ed51b3 1287 else
ffc61ed2
JH
1288 tmp = 1;
1289 s++;
a0ed51b3 1290 }
a0ed51b3
LW
1291 }
1292 break;
a0d0e21e 1293 case NALNUM:
ffc61ed2 1294 if (do_utf8) {
8269fa76 1295 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1296 while (s < strend) {
3568d838 1297 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1298 if (tmp && (norun || regtry(prog, s)))
1299 goto got_it;
1300 else
1301 tmp = doevery;
1302 }
bbce6d69 1303 else
ffc61ed2
JH
1304 tmp = 1;
1305 s += UTF8SKIP(s);
bbce6d69 1306 }
bbce6d69 1307 }
ffc61ed2
JH
1308 else {
1309 while (s < strend) {
1310 if (!isALNUM(*s)) {
1311 if (tmp && (norun || regtry(prog, s)))
1312 goto got_it;
1313 else
1314 tmp = doevery;
1315 }
a0ed51b3 1316 else
ffc61ed2
JH
1317 tmp = 1;
1318 s++;
a0ed51b3 1319 }
a0ed51b3
LW
1320 }
1321 break;
bbce6d69 1322 case NALNUML:
3280af22 1323 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1324 if (do_utf8) {
1325 while (s < strend) {
1326 if (!isALNUM_LC_utf8((U8*)s)) {
1327 if (tmp && (norun || regtry(prog, s)))
1328 goto got_it;
1329 else
1330 tmp = doevery;
1331 }
a687059c 1332 else
ffc61ed2
JH
1333 tmp = 1;
1334 s += UTF8SKIP(s);
a687059c 1335 }
a0d0e21e 1336 }
ffc61ed2
JH
1337 else {
1338 while (s < strend) {
1339 if (!isALNUM_LC(*s)) {
1340 if (tmp && (norun || regtry(prog, s)))
1341 goto got_it;
1342 else
1343 tmp = doevery;
1344 }
a0ed51b3 1345 else
ffc61ed2
JH
1346 tmp = 1;
1347 s++;
a0ed51b3 1348 }
a0ed51b3
LW
1349 }
1350 break;
a0d0e21e 1351 case SPACE:
ffc61ed2 1352 if (do_utf8) {
8269fa76 1353 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1354 while (s < strend) {
3568d838 1355 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1356 if (tmp && (norun || regtry(prog, s)))
1357 goto got_it;
1358 else
1359 tmp = doevery;
1360 }
a0d0e21e 1361 else
ffc61ed2
JH
1362 tmp = 1;
1363 s += UTF8SKIP(s);
2304df62 1364 }
a0d0e21e 1365 }
ffc61ed2
JH
1366 else {
1367 while (s < strend) {
1368 if (isSPACE(*s)) {
1369 if (tmp && (norun || regtry(prog, s)))
1370 goto got_it;
1371 else
1372 tmp = doevery;
1373 }
a0ed51b3 1374 else
ffc61ed2
JH
1375 tmp = 1;
1376 s++;
a0ed51b3 1377 }
a0ed51b3
LW
1378 }
1379 break;
bbce6d69 1380 case SPACEL:
3280af22 1381 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1382 if (do_utf8) {
1383 while (s < strend) {
1384 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1385 if (tmp && (norun || regtry(prog, s)))
1386 goto got_it;
1387 else
1388 tmp = doevery;
1389 }
bbce6d69 1390 else
ffc61ed2
JH
1391 tmp = 1;
1392 s += UTF8SKIP(s);
bbce6d69 1393 }
bbce6d69 1394 }
ffc61ed2
JH
1395 else {
1396 while (s < strend) {
1397 if (isSPACE_LC(*s)) {
1398 if (tmp && (norun || regtry(prog, s)))
1399 goto got_it;
1400 else
1401 tmp = doevery;
1402 }
a0ed51b3 1403 else
ffc61ed2
JH
1404 tmp = 1;
1405 s++;
a0ed51b3 1406 }
a0ed51b3
LW
1407 }
1408 break;
a0d0e21e 1409 case NSPACE:
ffc61ed2 1410 if (do_utf8) {
8269fa76 1411 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1412 while (s < strend) {
3568d838 1413 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1414 if (tmp && (norun || regtry(prog, s)))
1415 goto got_it;
1416 else
1417 tmp = doevery;
1418 }
a0d0e21e 1419 else
ffc61ed2
JH
1420 tmp = 1;
1421 s += UTF8SKIP(s);
a687059c 1422 }
a0d0e21e 1423 }
ffc61ed2
JH
1424 else {
1425 while (s < strend) {
1426 if (!isSPACE(*s)) {
1427 if (tmp && (norun || regtry(prog, s)))
1428 goto got_it;
1429 else
1430 tmp = doevery;
1431 }
a0ed51b3 1432 else
ffc61ed2
JH
1433 tmp = 1;
1434 s++;
a0ed51b3 1435 }
a0ed51b3
LW
1436 }
1437 break;
bbce6d69 1438 case NSPACEL:
3280af22 1439 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1440 if (do_utf8) {
1441 while (s < strend) {
1442 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1443 if (tmp && (norun || regtry(prog, s)))
1444 goto got_it;
1445 else
1446 tmp = doevery;
1447 }
bbce6d69 1448 else
ffc61ed2
JH
1449 tmp = 1;
1450 s += UTF8SKIP(s);
bbce6d69 1451 }
bbce6d69 1452 }
ffc61ed2
JH
1453 else {
1454 while (s < strend) {
1455 if (!isSPACE_LC(*s)) {
1456 if (tmp && (norun || regtry(prog, s)))
1457 goto got_it;
1458 else
1459 tmp = doevery;
1460 }
a0ed51b3 1461 else
ffc61ed2
JH
1462 tmp = 1;
1463 s++;
a0ed51b3 1464 }
a0ed51b3
LW
1465 }
1466 break;
a0d0e21e 1467 case DIGIT:
ffc61ed2 1468 if (do_utf8) {
8269fa76 1469 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1470 while (s < strend) {
3568d838 1471 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1472 if (tmp && (norun || regtry(prog, s)))
1473 goto got_it;
1474 else
1475 tmp = doevery;
1476 }
a0d0e21e 1477 else
ffc61ed2
JH
1478 tmp = 1;
1479 s += UTF8SKIP(s);
2b69d0c2 1480 }
a0d0e21e 1481 }
ffc61ed2
JH
1482 else {
1483 while (s < strend) {
1484 if (isDIGIT(*s)) {
1485 if (tmp && (norun || regtry(prog, s)))
1486 goto got_it;
1487 else
1488 tmp = doevery;
1489 }
a0ed51b3 1490 else
ffc61ed2
JH
1491 tmp = 1;
1492 s++;
a0ed51b3 1493 }
a0ed51b3
LW
1494 }
1495 break;
b8c5462f
JH
1496 case DIGITL:
1497 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1498 if (do_utf8) {
1499 while (s < strend) {
1500 if (isDIGIT_LC_utf8((U8*)s)) {
1501 if (tmp && (norun || regtry(prog, s)))
1502 goto got_it;
1503 else
1504 tmp = doevery;
1505 }
b8c5462f 1506 else
ffc61ed2
JH
1507 tmp = 1;
1508 s += UTF8SKIP(s);
b8c5462f 1509 }
b8c5462f 1510 }
ffc61ed2
JH
1511 else {
1512 while (s < strend) {
1513 if (isDIGIT_LC(*s)) {
1514 if (tmp && (norun || regtry(prog, s)))
1515 goto got_it;
1516 else
1517 tmp = doevery;
1518 }
b8c5462f 1519 else
ffc61ed2
JH
1520 tmp = 1;
1521 s++;
b8c5462f 1522 }
b8c5462f
JH
1523 }
1524 break;
a0d0e21e 1525 case NDIGIT:
ffc61ed2 1526 if (do_utf8) {
8269fa76 1527 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1528 while (s < strend) {
3568d838 1529 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1530 if (tmp && (norun || regtry(prog, s)))
1531 goto got_it;
1532 else
1533 tmp = doevery;
1534 }
a0d0e21e 1535 else
ffc61ed2
JH
1536 tmp = 1;
1537 s += UTF8SKIP(s);
a687059c 1538 }
a0d0e21e 1539 }
ffc61ed2
JH
1540 else {
1541 while (s < strend) {
1542 if (!isDIGIT(*s)) {
1543 if (tmp && (norun || regtry(prog, s)))
1544 goto got_it;
1545 else
1546 tmp = doevery;
1547 }
a0ed51b3 1548 else
ffc61ed2
JH
1549 tmp = 1;
1550 s++;
a0ed51b3 1551 }
a0ed51b3
LW
1552 }
1553 break;
b8c5462f
JH
1554 case NDIGITL:
1555 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1556 if (do_utf8) {
1557 while (s < strend) {
1558 if (!isDIGIT_LC_utf8((U8*)s)) {
1559 if (tmp && (norun || regtry(prog, s)))
1560 goto got_it;
1561 else
1562 tmp = doevery;
1563 }
b8c5462f 1564 else
ffc61ed2
JH
1565 tmp = 1;
1566 s += UTF8SKIP(s);
b8c5462f 1567 }
a0ed51b3 1568 }
ffc61ed2
JH
1569 else {
1570 while (s < strend) {
1571 if (!isDIGIT_LC(*s)) {
1572 if (tmp && (norun || regtry(prog, s)))
1573 goto got_it;
1574 else
1575 tmp = doevery;
1576 }
cf93c79d 1577 else
ffc61ed2
JH
1578 tmp = 1;
1579 s++;
b8c5462f 1580 }
b8c5462f
JH
1581 }
1582 break;
b3c9acc1 1583 default:
3c3eec57
GS
1584 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1585 break;
d6a28714 1586 }
6eb5f6b9
JH
1587 return 0;
1588 got_it:
1589 return s;
1590}
1591
1592/*
1593 - regexec_flags - match a regexp against a string
1594 */
1595I32
1596Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1597 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1598/* strend: pointer to null at end of string */
1599/* strbeg: real beginning of string */
1600/* minend: end of match must be >=minend after stringarg. */
1601/* data: May be used for some additional optimizations. */
1602/* nosave: For optimizations. */
1603{
6eb5f6b9
JH
1604 register char *s;
1605 register regnode *c;
1606 register char *startpos = stringarg;
6eb5f6b9
JH
1607 I32 minlen; /* must match at least this many chars */
1608 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1609 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1610 constant substr. */ /* CC */
1611 I32 end_shift = 0; /* Same for the end. */ /* CC */
1612 I32 scream_pos = -1; /* Internal iterator of scream. */
1613 char *scream_olds;
1614 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1615 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1616#ifdef DEBUGGING
9e55ce06
JH
1617 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1618 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1619#endif
a30b2f1f 1620 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1621
1622 PL_regcc = 0;
1623
1624 cache_re(prog);
1625#ifdef DEBUGGING
aea4f609 1626 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1627#endif
1628
1629 /* Be paranoid... */
1630 if (prog == NULL || startpos == NULL) {
1631 Perl_croak(aTHX_ "NULL regexp parameter");
1632 return 0;
1633 }
1634
1635 minlen = prog->minlen;
61a36c01 1636 if (strend - startpos < minlen) {
a72c7584
JH
1637 DEBUG_r(PerlIO_printf(Perl_debug_log,
1638 "String too short [regexec_flags]...\n"));
1639 goto phooey;
1aa99e6b 1640 }
6eb5f6b9 1641
6eb5f6b9
JH
1642 /* Check validity of program. */
1643 if (UCHARAT(prog->program) != REG_MAGIC) {
1644 Perl_croak(aTHX_ "corrupted regexp program");
1645 }
1646
1647 PL_reg_flags = 0;
1648 PL_reg_eval_set = 0;
1649 PL_reg_maxiter = 0;
1650
1651 if (prog->reganch & ROPT_UTF8)
1652 PL_reg_flags |= RF_utf8;
1653
1654 /* Mark beginning of line for ^ and lookbehind. */
1655 PL_regbol = startpos;
1656 PL_bostr = strbeg;
1657 PL_reg_sv = sv;
1658
1659 /* Mark end of line for $ (and such) */
1660 PL_regeol = strend;
1661
1662 /* see how far we have to get to not match where we matched before */
1663 PL_regtill = startpos+minend;
1664
1665 /* We start without call_cc context. */
1666 PL_reg_call_cc = 0;
1667
1668 /* If there is a "must appear" string, look for it. */
1669 s = startpos;
1670
1671 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1672 MAGIC *mg;
1673
1674 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1675 PL_reg_ganch = startpos;
1676 else if (sv && SvTYPE(sv) >= SVt_PVMG
1677 && SvMAGIC(sv)
14befaf4
DM
1678 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1679 && mg->mg_len >= 0) {
6eb5f6b9
JH
1680 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1681 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1682 if (s > PL_reg_ganch)
6eb5f6b9
JH
1683 goto phooey;
1684 s = PL_reg_ganch;
1685 }
1686 }
1687 else /* pos() not defined */
1688 PL_reg_ganch = strbeg;
1689 }
1690
33b8afdf 1691 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1692 re_scream_pos_data d;
1693
1694 d.scream_olds = &scream_olds;
1695 d.scream_pos = &scream_pos;
1696 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1697 if (!s) {
1698 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1699 goto phooey; /* not present */
3fa9c3d7 1700 }
6eb5f6b9
JH
1701 }
1702
2a782b5b 1703 DEBUG_r({
9e55ce06
JH
1704 char *s0 = UTF ?
1705 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1706 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1707 prog->precomp;
1708 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1709 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1710 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1711 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1712 if (!PL_colorset)
1713 reginitcolors();
1714 PerlIO_printf(Perl_debug_log,
9e55ce06 1715 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1716 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1717 len0, len0, s0,
2a782b5b 1718 PL_colors[1],
9e55ce06 1719 len0 > 60 ? "..." : "",
2a782b5b 1720 PL_colors[0],
9e55ce06
JH
1721 (int)(len1 > 60 ? 60 : len1),
1722 s1, PL_colors[1],
1723 (len1 > 60 ? "..." : "")
2a782b5b
JH
1724 );
1725 });
6eb5f6b9
JH
1726
1727 /* Simplest case: anchored match need be tried only once. */
1728 /* [unless only anchor is BOL and multiline is set] */
1729 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1730 if (s == startpos && regtry(prog, startpos))
1731 goto got_it;
1732 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1733 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1734 {
1735 char *end;
1736
1737 if (minlen)
1738 dontbother = minlen - 1;
1aa99e6b 1739 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1740 /* for multiline we only have to try after newlines */
33b8afdf 1741 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1742 if (s == startpos)
1743 goto after_try;
1744 while (1) {
1745 if (regtry(prog, s))
1746 goto got_it;
1747 after_try:
1748 if (s >= end)
1749 goto phooey;
1750 if (prog->reganch & RE_USE_INTUIT) {
1751 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1752 if (!s)
1753 goto phooey;
1754 }
1755 else
1756 s++;
1757 }
1758 } else {
1759 if (s > startpos)
1760 s--;
1761 while (s < end) {
1762 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1763 if (regtry(prog, s))
1764 goto got_it;
1765 }
1766 }
1767 }
1768 }
1769 goto phooey;
1770 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1771 if (regtry(prog, PL_reg_ganch))
1772 goto got_it;
1773 goto phooey;
1774 }
1775
1776 /* Messy cases: unanchored match. */
33b8afdf 1777 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1778 /* we have /x+whatever/ */
1779 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1780 char ch;
bf93d4cc
GS
1781#ifdef DEBUGGING
1782 int did_match = 0;
1783#endif
33b8afdf
JH
1784 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1785 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1786 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1787
1aa99e6b 1788 if (do_utf8) {
6eb5f6b9
JH
1789 while (s < strend) {
1790 if (*s == ch) {
bf93d4cc 1791 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1792 if (regtry(prog, s)) goto got_it;
1793 s += UTF8SKIP(s);
1794 while (s < strend && *s == ch)
1795 s += UTF8SKIP(s);
1796 }
1797 s += UTF8SKIP(s);
1798 }
1799 }
1800 else {
1801 while (s < strend) {
1802 if (*s == ch) {
bf93d4cc 1803 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1804 if (regtry(prog, s)) goto got_it;
1805 s++;
1806 while (s < strend && *s == ch)
1807 s++;
1808 }
1809 s++;
1810 }
1811 }
b7953727 1812 DEBUG_r(if (!did_match)
bf93d4cc 1813 PerlIO_printf(Perl_debug_log,
b7953727
JH
1814 "Did not find anchored character...\n")
1815 );
6eb5f6b9
JH
1816 }
1817 /*SUPPRESS 560*/
33b8afdf
JH
1818 else if (prog->anchored_substr != Nullsv
1819 || prog->anchored_utf8 != Nullsv
1820 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1821 && prog->float_max_offset < strend - s)) {
1822 SV *must;
1823 I32 back_max;
1824 I32 back_min;
1825 char *last;
6eb5f6b9 1826 char *last1; /* Last position checked before */
bf93d4cc
GS
1827#ifdef DEBUGGING
1828 int did_match = 0;
1829#endif
33b8afdf
JH
1830 if (prog->anchored_substr || prog->anchored_utf8) {
1831 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1832 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1833 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1834 back_max = back_min = prog->anchored_offset;
1835 } else {
1836 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1837 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1838 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1839 back_max = prog->float_max_offset;
1840 back_min = prog->float_min_offset;
1841 }
1842 if (must == &PL_sv_undef)
1843 /* could not downgrade utf8 check substring, so must fail */
1844 goto phooey;
1845
1846 last = HOP3c(strend, /* Cannot start after this */
1847 -(I32)(CHR_SVLEN(must)
1848 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1849
1850 if (s > PL_bostr)
1851 last1 = HOPc(s, -1);
1852 else
1853 last1 = s - 1; /* bogus */
1854
1855 /* XXXX check_substr already used to find `s', can optimize if
1856 check_substr==must. */
1857 scream_pos = -1;
1858 dontbother = end_shift;
1859 strend = HOPc(strend, -dontbother);
1860 while ( (s <= last) &&
9041c2e3 1861 ((flags & REXEC_SCREAM)
1aa99e6b 1862 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1863 end_shift, &scream_pos, 0))
1aa99e6b 1864 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1865 (unsigned char*)strend, must,
6eb5f6b9 1866 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1867 /* we may be pointing at the wrong string */
1868 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
7ef91622 1869 s = strbeg + (s - SvPVX(sv));
bf93d4cc 1870 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1871 if (HOPc(s, -back_max) > last1) {
1872 last1 = HOPc(s, -back_min);
1873 s = HOPc(s, -back_max);
1874 }
1875 else {
1876 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1877
1878 last1 = HOPc(s, -back_min);
1879 s = t;
1880 }
1aa99e6b 1881 if (do_utf8) {
6eb5f6b9
JH
1882 while (s <= last1) {
1883 if (regtry(prog, s))
1884 goto got_it;
1885 s += UTF8SKIP(s);
1886 }
1887 }
1888 else {
1889 while (s <= last1) {
1890 if (regtry(prog, s))
1891 goto got_it;
1892 s++;
1893 }
1894 }
1895 }
b7953727
JH
1896 DEBUG_r(if (!did_match)
1897 PerlIO_printf(Perl_debug_log,
1898 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1899 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1900 ? "anchored" : "floating"),
1901 PL_colors[0],
1902 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1903 SvPVX(must),
b7953727
JH
1904 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1905 );
6eb5f6b9
JH
1906 goto phooey;
1907 }
155aba94 1908 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1909 if (minlen) {
1910 I32 op = (U8)OP(prog->regstclass);
66e933ab 1911 /* don't bother with what can't match */
f14c76ed
RGS
1912 if (PL_regkind[op] != EXACT && op != CANY)
1913 strend = HOPc(strend, -(minlen - 1));
1914 }
ffc61ed2
JH
1915 DEBUG_r({
1916 SV *prop = sv_newmortal();
9e55ce06
JH
1917 char *s0;
1918 char *s1;
1919 int len0;
1920 int len1;
1921
ffc61ed2 1922 regprop(prop, c);
9e55ce06
JH
1923 s0 = UTF ?
1924 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1925 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1926 SvPVX(prop);
1927 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1928 s1 = UTF ?
c728cb41 1929 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1930 len1 = UTF ? SvCUR(dsv1) : strend - s;
1931 PerlIO_printf(Perl_debug_log,
1932 "Matching stclass `%*.*s' against `%*.*s'\n",
1933 len0, len0, s0,
1934 len1, len1, s1);
ffc61ed2 1935 });
6eb5f6b9
JH
1936 if (find_byclass(prog, c, s, strend, startpos, 0))
1937 goto got_it;
bf93d4cc 1938 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1939 }
1940 else {
1941 dontbother = 0;
33b8afdf
JH
1942 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1943 /* Trim the end. */
d6a28714 1944 char *last;
33b8afdf
JH
1945 SV* float_real;
1946
1947 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1948 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1949 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1950
1951 if (flags & REXEC_SCREAM) {
33b8afdf 1952 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1953 end_shift, &scream_pos, 1); /* last one */
1954 if (!last)
ffc61ed2 1955 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1956 /* we may be pointing at the wrong string */
1957 else if (RX_MATCH_COPIED(prog))
7ef91622 1958 s = strbeg + (s - SvPVX(sv));
b8c5462f 1959 }
d6a28714
JH
1960 else {
1961 STRLEN len;
33b8afdf 1962 char *little = SvPV(float_real, len);
d6a28714 1963
33b8afdf 1964 if (SvTAIL(float_real)) {
d6a28714
JH
1965 if (memEQ(strend - len + 1, little, len - 1))
1966 last = strend - len + 1;
1967 else if (!PL_multiline)
9041c2e3 1968 last = memEQ(strend - len, little, len)
d6a28714 1969 ? strend - len : Nullch;
b8c5462f 1970 else
d6a28714
JH
1971 goto find_last;
1972 } else {
1973 find_last:
9041c2e3 1974 if (len)
d6a28714 1975 last = rninstr(s, strend, little, little + len);
b8c5462f 1976 else
d6a28714 1977 last = strend; /* matching `$' */
b8c5462f 1978 }
b8c5462f 1979 }
bf93d4cc
GS
1980 if (last == NULL) {
1981 DEBUG_r(PerlIO_printf(Perl_debug_log,
1982 "%sCan't trim the tail, match fails (should not happen)%s\n",
1983 PL_colors[4],PL_colors[5]));
1984 goto phooey; /* Should not happen! */
1985 }
d6a28714
JH
1986 dontbother = strend - last + prog->float_min_offset;
1987 }
1988 if (minlen && (dontbother < minlen))
1989 dontbother = minlen - 1;
1990 strend -= dontbother; /* this one's always in bytes! */
1991 /* We don't know much -- general case. */
1aa99e6b 1992 if (do_utf8) {
d6a28714
JH
1993 for (;;) {
1994 if (regtry(prog, s))
1995 goto got_it;
1996 if (s >= strend)
1997 break;
b8c5462f 1998 s += UTF8SKIP(s);
d6a28714
JH
1999 };
2000 }
2001 else {
2002 do {
2003 if (regtry(prog, s))
2004 goto got_it;
2005 } while (s++ < strend);
2006 }
2007 }
2008
2009 /* Failure. */
2010 goto phooey;
2011
2012got_it:
2013 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2014
2015 if (PL_reg_eval_set) {
2016 /* Preserve the current value of $^R */
2017 if (oreplsv != GvSV(PL_replgv))
2018 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2019 restored, the value remains
2020 the same. */
acfe0abc 2021 restore_pos(aTHX_ 0);
d6a28714
JH
2022 }
2023
2024 /* make sure $`, $&, $', and $digit will work later */
2025 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2026 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2027 if (flags & REXEC_COPY_STR) {
2028 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2029#ifdef PERL_COPY_ON_WRITE
2030 if ((SvIsCOW(sv)
2031 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2032 if (DEBUG_C_TEST) {
2033 PerlIO_printf(Perl_debug_log,
2034 "Copy on write: regexp capture, type %d\n",
2035 (int) SvTYPE(sv));
2036 }
2037 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2038 prog->subbeg = SvPVX(prog->saved_copy);
2039 assert (SvPOKp(prog->saved_copy));
2040 } else
2041#endif
2042 {
2043 RX_MATCH_COPIED_on(prog);
2044 s = savepvn(strbeg, i);
2045 prog->subbeg = s;
2046 }
d6a28714 2047 prog->sublen = i;
d6a28714
JH
2048 }
2049 else {
2050 prog->subbeg = strbeg;
2051 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2052 }
2053 }
9041c2e3 2054
d6a28714
JH
2055 return 1;
2056
2057phooey:
bf93d4cc
GS
2058 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2059 PL_colors[4],PL_colors[5]));
d6a28714 2060 if (PL_reg_eval_set)
acfe0abc 2061 restore_pos(aTHX_ 0);
d6a28714
JH
2062 return 0;
2063}
2064
2065/*
2066 - regtry - try match at specific point
2067 */
2068STATIC I32 /* 0 failure, 1 success */
2069S_regtry(pTHX_ regexp *prog, char *startpos)
2070{
d6a28714
JH
2071 register I32 i;
2072 register I32 *sp;
2073 register I32 *ep;
2074 CHECKPOINT lastcp;
2075
02db2b7b
IZ
2076#ifdef DEBUGGING
2077 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2078#endif
d6a28714
JH
2079 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2080 MAGIC *mg;
2081
2082 PL_reg_eval_set = RS_init;
2083 DEBUG_r(DEBUG_s(
b900a521
JH
2084 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2085 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2086 ));
e8347627 2087 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2088 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2089 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2090 SAVETMPS;
2091 /* Apparently this is not needed, judging by wantarray. */
e8347627 2092 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2093 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2094
2095 if (PL_reg_sv) {
2096 /* Make $_ available to executed code. */
2097 if (PL_reg_sv != DEFSV) {
4d1ff10f 2098 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2099 SAVESPTR(DEFSV);
2100 DEFSV = PL_reg_sv;
b8c5462f 2101 }
d6a28714 2102
9041c2e3 2103 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2104 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2105 /* prepare for quick setting of pos */
14befaf4
DM
2106 sv_magic(PL_reg_sv, (SV*)0,
2107 PERL_MAGIC_regex_global, Nullch, 0);
2108 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2109 mg->mg_len = -1;
b8c5462f 2110 }
d6a28714
JH
2111 PL_reg_magic = mg;
2112 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2113 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2114 }
09687e5a 2115 if (!PL_reg_curpm) {
0f79a09d 2116 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2117#ifdef USE_ITHREADS
2118 {
2119 SV* repointer = newSViv(0);
577e12cc 2120 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2121 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2122 av_push(PL_regex_padav,repointer);
2123 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2124 PL_regex_pad = AvARRAY(PL_regex_padav);
2125 }
2126#endif
2127 }
aaa362c4 2128 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2129 PL_reg_oldcurpm = PL_curpm;
2130 PL_curpm = PL_reg_curpm;
2131 if (RX_MATCH_COPIED(prog)) {
2132 /* Here is a serious problem: we cannot rewrite subbeg,
2133 since it may be needed if this match fails. Thus
2134 $` inside (?{}) could fail... */
2135 PL_reg_oldsaved = prog->subbeg;
2136 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2137#ifdef PERL_COPY_ON_WRITE
2138 PL_nrs = prog->saved_copy;
2139#endif
d6a28714
JH
2140 RX_MATCH_COPIED_off(prog);
2141 }
2142 else
2143 PL_reg_oldsaved = Nullch;
2144 prog->subbeg = PL_bostr;
2145 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2146 }
2147 prog->startp[0] = startpos - PL_bostr;
2148 PL_reginput = startpos;
2149 PL_regstartp = prog->startp;
2150 PL_regendp = prog->endp;
2151 PL_reglastparen = &prog->lastparen;
a01268b5 2152 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2153 prog->lastparen = 0;
03994de8 2154 prog->lastcloseparen = 0;
d6a28714
JH
2155 PL_regsize = 0;
2156 DEBUG_r(PL_reg_starttry = startpos);
2157 if (PL_reg_start_tmpl <= prog->nparens) {
2158 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2159 if(PL_reg_start_tmp)
2160 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2161 else
2162 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2163 }
2164
2165 /* XXXX What this code is doing here?!!! There should be no need
2166 to do this again and again, PL_reglastparen should take care of
3dd2943c 2167 this! --ilya*/
dafc8851
JH
2168
2169 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2170 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2171 * PL_reglastparen), is not needed at all by the test suite
2172 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2173 * enough, for building DynaLoader, or otherwise this
2174 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2175 * will happen. Meanwhile, this code *is* needed for the
2176 * above-mentioned test suite tests to succeed. The common theme
2177 * on those tests seems to be returning null fields from matches.
2178 * --jhi */
dafc8851 2179#if 1
d6a28714
JH
2180 sp = prog->startp;
2181 ep = prog->endp;
2182 if (prog->nparens) {
eb160463 2183 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2184 *++sp = -1;
2185 *++ep = -1;
2186 }
2187 }
dafc8851 2188#endif
02db2b7b 2189 REGCP_SET(lastcp);
d6a28714
JH
2190 if (regmatch(prog->program + 1)) {
2191 prog->endp[0] = PL_reginput - PL_bostr;
2192 return 1;
2193 }
02db2b7b 2194 REGCP_UNWIND(lastcp);
d6a28714
JH
2195 return 0;
2196}
2197
02db2b7b
IZ
2198#define RE_UNWIND_BRANCH 1
2199#define RE_UNWIND_BRANCHJ 2
2200
2201union re_unwind_t;
2202
2203typedef struct { /* XX: makes sense to enlarge it... */
2204 I32 type;
2205 I32 prev;
2206 CHECKPOINT lastcp;
2207} re_unwind_generic_t;
2208
2209typedef struct {
2210 I32 type;
2211 I32 prev;
2212 CHECKPOINT lastcp;
2213 I32 lastparen;
2214 regnode *next;
2215 char *locinput;
2216 I32 nextchr;
2217#ifdef DEBUGGING
2218 int regindent;
2219#endif
2220} re_unwind_branch_t;
2221
2222typedef union re_unwind_t {
2223 I32 type;
2224 re_unwind_generic_t generic;
2225 re_unwind_branch_t branch;
2226} re_unwind_t;
2227
8ba1375e
MJD
2228#define sayYES goto yes
2229#define sayNO goto no
e0f9d4a8 2230#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2231#define sayYES_FINAL goto yes_final
2232#define sayYES_LOUD goto yes_loud
2233#define sayNO_FINAL goto no_final
2234#define sayNO_SILENT goto do_no
2235#define saySAME(x) if (x) goto yes; else goto no
2236
2237#define REPORT_CODE_OFF 24
2238
d6a28714
JH
2239/*
2240 - regmatch - main matching routine
2241 *
2242 * Conceptually the strategy is simple: check to see whether the current
2243 * node matches, call self recursively to see whether the rest matches,
2244 * and then act accordingly. In practice we make some effort to avoid
2245 * recursion, in particular by going through "ordinary" nodes (that don't
2246 * need to know whether the rest of the match failed) by a loop instead of
2247 * by recursion.
2248 */
2249/* [lwall] I've hoisted the register declarations to the outer block in order to
2250 * maybe save a little bit of pushing and popping on the stack. It also takes
2251 * advantage of machines that use a register save mask on subroutine entry.
2252 */
2253STATIC I32 /* 0 failure, 1 success */
2254S_regmatch(pTHX_ regnode *prog)
2255{
d6a28714
JH
2256 register regnode *scan; /* Current node. */
2257 regnode *next; /* Next node. */
2258 regnode *inner; /* Next node in internal branch. */
2259 register I32 nextchr; /* renamed nextchr - nextchar colides with
2260 function of same name */
2261 register I32 n; /* no or next */
b7953727
JH
2262 register I32 ln = 0; /* len or last */
2263 register char *s = Nullch; /* operand or save */
d6a28714 2264 register char *locinput = PL_reginput;
b7953727 2265 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2266 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2267 I32 unwind = 0;
b7953727 2268#if 0
02db2b7b 2269 I32 firstcp = PL_savestack_ix;
b7953727 2270#endif
53c4c00c 2271 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2272#ifdef DEBUGGING
ce333219
JH
2273 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2274 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2275 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2276#endif
02db2b7b 2277
d6a28714
JH
2278#ifdef DEBUGGING
2279 PL_regindent++;
2280#endif
2281
2282 /* Note that nextchr is a byte even in UTF */
2283 nextchr = UCHARAT(locinput);
2284 scan = prog;
2285 while (scan != NULL) {
8ba1375e 2286
2a782b5b 2287 DEBUG_r( {
d6a28714
JH
2288 SV *prop = sv_newmortal();
2289 int docolor = *PL_colors[0];
2290 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2291 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2292 /* The part of the string before starttry has one color
2293 (pref0_len chars), between starttry and current
2294 position another one (pref_len - pref0_len chars),
2295 after the current position the third one.
2296 We assume that pref0_len <= pref_len, otherwise we
2297 decrease pref0_len. */
9041c2e3 2298 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2299 ? (5 + taill) - l : locinput - PL_bostr;
2300 int pref0_len;
d6a28714 2301
df1ffd02 2302 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2303 pref_len++;
2304 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2305 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2306 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2307 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2308 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2309 l--;
d6a28714
JH
2310 if (pref0_len < 0)
2311 pref0_len = 0;
2312 if (pref0_len > pref_len)
2313 pref0_len = pref_len;
2314 regprop(prop, scan);
2a782b5b
JH
2315 {
2316 char *s0 =
f14c76ed 2317 do_utf8 && OP(scan) != CANY ?
2a782b5b 2318 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2319 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2320 locinput - pref_len;
df1ffd02 2321 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2322 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2323 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2324 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2325 locinput - pref_len + pref0_len;
df1ffd02 2326 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2327 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2328 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2329 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2330 locinput;
df1ffd02 2331 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2332 PerlIO_printf(Perl_debug_log,
2333 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2334 (IV)(locinput - PL_bostr),
2335 PL_colors[4],
2336 len0, s0,
2337 PL_colors[5],
2338 PL_colors[2],
2339 len1, s1,
2340 PL_colors[3],
2341 (docolor ? "" : "> <"),
2342 PL_colors[0],
2343 len2, s2,
2344 PL_colors[1],
2345 15 - l - pref_len + 1,
2346 "",
2347 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2348 SvPVX(prop));
2349 }
2350 });
d6a28714
JH
2351
2352 next = scan + NEXT_OFF(scan);
2353 if (next == scan)
2354 next = NULL;
2355
2356 switch (OP(scan)) {
2357 case BOL:
12d33761
HS
2358 if (locinput == PL_bostr || (PL_multiline &&
2359 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2360 {
2361 /* regtill = regbol; */
b8c5462f
JH
2362 break;
2363 }
d6a28714
JH
2364 sayNO;
2365 case MBOL:
12d33761
HS
2366 if (locinput == PL_bostr ||
2367 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2368 {
b8c5462f
JH
2369 break;
2370 }
d6a28714
JH
2371 sayNO;
2372 case SBOL:
c2a73568 2373 if (locinput == PL_bostr)
b8c5462f 2374 break;
d6a28714
JH
2375 sayNO;
2376 case GPOS:
2377 if (locinput == PL_reg_ganch)
2378 break;
2379 sayNO;
2380 case EOL:
2381 if (PL_multiline)
2382 goto meol;
2383 else
2384 goto seol;
2385 case MEOL:
2386 meol:
2387 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2388 sayNO;
b8c5462f 2389 break;
d6a28714
JH
2390 case SEOL:
2391 seol:
2392 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2393 sayNO;
d6a28714 2394 if (PL_regeol - locinput > 1)
b8c5462f 2395 sayNO;
b8c5462f 2396 break;
d6a28714
JH
2397 case EOS:
2398 if (PL_regeol != locinput)
b8c5462f 2399 sayNO;
d6a28714 2400 break;
ffc61ed2 2401 case SANY:
d6a28714 2402 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2403 sayNO;
f33976b4
DB
2404 if (do_utf8) {
2405 locinput += PL_utf8skip[nextchr];
2406 if (locinput > PL_regeol)
2407 sayNO;
2408 nextchr = UCHARAT(locinput);
2409 }
2410 else
2411 nextchr = UCHARAT(++locinput);
2412 break;
2413 case CANY:
2414 if (!nextchr && locinput >= PL_regeol)
2415 sayNO;
b8c5462f 2416 nextchr = UCHARAT(++locinput);
a0d0e21e 2417 break;
ffc61ed2 2418 case REG_ANY:
1aa99e6b
IH
2419 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2420 sayNO;
2421 if (do_utf8) {
b8c5462f 2422 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2423 if (locinput > PL_regeol)
2424 sayNO;
a0ed51b3 2425 nextchr = UCHARAT(locinput);
a0ed51b3 2426 }
1aa99e6b
IH
2427 else
2428 nextchr = UCHARAT(++locinput);
a0ed51b3 2429 break;
d6a28714 2430 case EXACT:
cd439c50
IZ
2431 s = STRING(scan);
2432 ln = STR_LEN(scan);
eb160463 2433 if (do_utf8 != UTF) {
bc517b45 2434 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2435 char *l = locinput;
2436 char *e = s + ln;
bc517b45 2437 STRLEN ulen;
a72c7584 2438
5ff6fc6d
JH
2439 if (do_utf8) {
2440 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2441 while (s < e) {
2442 if (l >= PL_regeol)
5ff6fc6d
JH
2443 sayNO;
2444 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2445 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2446 ckWARN(WARN_UTF8) ?
2447 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2448 sayNO;
bc517b45 2449 l += ulen;
5ff6fc6d 2450 s ++;
1aa99e6b 2451 }
5ff6fc6d
JH
2452 }
2453 else {
2454 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2455 while (s < e) {
2456 if (l >= PL_regeol)
2457 sayNO;
5ff6fc6d 2458 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2459 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2460 ckWARN(WARN_UTF8) ?
2461 0 : UTF8_ALLOW_ANY))
1aa99e6b 2462 sayNO;
bc517b45 2463 s += ulen;
a72c7584 2464 l ++;
1aa99e6b 2465 }
5ff6fc6d 2466 }
1aa99e6b
IH
2467 locinput = l;
2468 nextchr = UCHARAT(locinput);
2469 break;
2470 }
bc517b45 2471 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2472 /* Inline the first character, for speed. */
2473 if (UCHARAT(s) != nextchr)
2474 sayNO;
2475 if (PL_regeol - locinput < ln)
2476 sayNO;
2477 if (ln > 1 && memNE(s, locinput, ln))
2478 sayNO;
2479 locinput += ln;
2480 nextchr = UCHARAT(locinput);
2481 break;
2482 case EXACTFL:
b8c5462f
JH
2483 PL_reg_flags |= RF_tainted;
2484 /* FALL THROUGH */
d6a28714 2485 case EXACTF:
cd439c50
IZ
2486 s = STRING(scan);
2487 ln = STR_LEN(scan);
d6a28714 2488
d07ddd77
JH
2489 if (do_utf8 || UTF) {
2490 /* Either target or the pattern are utf8. */
d6a28714 2491 char *l = locinput;
d07ddd77 2492 char *e = PL_regeol;
bc517b45 2493
eb160463 2494 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2495 l, &e, 0, do_utf8)) {
5486206c
JH
2496 /* One more case for the sharp s:
2497 * pack("U0U*", 0xDF) =~ /ss/i,
2498 * the 0xC3 0x9F are the UTF-8
2499 * byte sequence for the U+00DF. */
2500 if (!(do_utf8 &&
2501 toLOWER(s[0]) == 's' &&
2502 ln >= 2 &&
2503 toLOWER(s[1]) == 's' &&
2504 (U8)l[0] == 0xC3 &&
2505 e - l >= 2 &&
2506 (U8)l[1] == 0x9F))
2507 sayNO;
2508 }
d07ddd77
JH
2509 locinput = e;
2510 nextchr = UCHARAT(locinput);
2511 break;
a0ed51b3 2512 }
d6a28714 2513
bc517b45
JH
2514 /* Neither the target and the pattern are utf8. */
2515
d6a28714
JH
2516 /* Inline the first character, for speed. */
2517 if (UCHARAT(s) != nextchr &&
2518 UCHARAT(s) != ((OP(scan) == EXACTF)
2519 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2520 sayNO;
d6a28714 2521 if (PL_regeol - locinput < ln)
b8c5462f 2522 sayNO;
d6a28714
JH
2523 if (ln > 1 && (OP(scan) == EXACTF
2524 ? ibcmp(s, locinput, ln)
2525 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2526 sayNO;
d6a28714
JH
2527 locinput += ln;
2528 nextchr = UCHARAT(locinput);
a0d0e21e 2529 break;
d6a28714 2530 case ANYOF:
ffc61ed2 2531 if (do_utf8) {
9e55ce06
JH
2532 STRLEN inclasslen = PL_regeol - locinput;
2533
ba7b4546 2534 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2535 sayNO_ANYOF;
ffc61ed2
JH
2536 if (locinput >= PL_regeol)
2537 sayNO;
0f0076b4 2538 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2539 nextchr = UCHARAT(locinput);
e0f9d4a8 2540 break;
ffc61ed2
JH
2541 }
2542 else {
2543 if (nextchr < 0)
2544 nextchr = UCHARAT(locinput);
7d3e948e 2545 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2546 sayNO_ANYOF;
ffc61ed2
JH
2547 if (!nextchr && locinput >= PL_regeol)
2548 sayNO;
2549 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2550 break;
2551 }
2552 no_anyof:
2553 /* If we might have the case of the German sharp s
2554 * in a casefolding Unicode character class. */
2555
ebc501f0
JH
2556 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2557 locinput += SHARP_S_SKIP;
e0f9d4a8 2558 nextchr = UCHARAT(locinput);
ffc61ed2 2559 }
e0f9d4a8
JH
2560 else
2561 sayNO;
b8c5462f 2562 break;
d6a28714 2563 case ALNUML:
b8c5462f
JH
2564 PL_reg_flags |= RF_tainted;
2565 /* FALL THROUGH */
d6a28714 2566 case ALNUM:
b8c5462f 2567 if (!nextchr)
4633a7c4 2568 sayNO;
ffc61ed2 2569 if (do_utf8) {
ad24be35 2570 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2571 if (!(OP(scan) == ALNUM
3568d838 2572 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2573 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2574 {
2575 sayNO;
a0ed51b3 2576 }
b8c5462f 2577 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2578 nextchr = UCHARAT(locinput);
2579 break;
2580 }
ffc61ed2 2581 if (!(OP(scan) == ALNUM
d6a28714 2582 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2583 sayNO;
b8c5462f 2584 nextchr = UCHARAT(++locinput);
a0d0e21e 2585 break;
d6a28714 2586 case NALNUML:
b8c5462f
JH
2587 PL_reg_flags |= RF_tainted;
2588 /* FALL THROUGH */
d6a28714
JH
2589 case NALNUM:
2590 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2591 sayNO;
ffc61ed2 2592 if (do_utf8) {
8269fa76 2593 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2594 if (OP(scan) == NALNUM
3568d838 2595 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2596 : isALNUM_LC_utf8((U8*)locinput))
2597 {
b8c5462f 2598 sayNO;
d6a28714 2599 }
b8c5462f
JH
2600 locinput += PL_utf8skip[nextchr];
2601 nextchr = UCHARAT(locinput);
2602 break;
2603 }
ffc61ed2 2604 if (OP(scan) == NALNUM
d6a28714 2605 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2606 sayNO;
76e3520e 2607 nextchr = UCHARAT(++locinput);
a0d0e21e 2608 break;
d6a28714
JH
2609 case BOUNDL:
2610 case NBOUNDL:
3280af22 2611 PL_reg_flags |= RF_tainted;
bbce6d69 2612 /* FALL THROUGH */
d6a28714
JH
2613 case BOUND:
2614 case NBOUND:
2615 /* was last char in word? */
ffc61ed2 2616 if (do_utf8) {
12d33761
HS
2617 if (locinput == PL_bostr)
2618 ln = '\n';
ffc61ed2 2619 else {
b4f7163a 2620 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2621
b4f7163a 2622 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2623 }
2624 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2625 ln = isALNUM_uni(ln);
8269fa76 2626 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2627 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2628 }
2629 else {
9041c2e3 2630 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2631 n = isALNUM_LC_utf8((U8*)locinput);
2632 }
a0ed51b3 2633 }
d6a28714 2634 else {
12d33761
HS
2635 ln = (locinput != PL_bostr) ?
2636 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2637 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2638 ln = isALNUM(ln);
2639 n = isALNUM(nextchr);
2640 }
2641 else {
2642 ln = isALNUM_LC(ln);
2643 n = isALNUM_LC(nextchr);
2644 }
d6a28714 2645 }
ffc61ed2
JH
2646 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2647 OP(scan) == BOUNDL))
2648 sayNO;
a0ed51b3 2649 break;
d6a28714 2650 case SPACEL:
3280af22 2651 PL_reg_flags |= RF_tainted;
bbce6d69 2652 /* FALL THROUGH */
d6a28714 2653 case SPACE:
9442cb0e 2654 if (!nextchr)
4633a7c4 2655 sayNO;
1aa99e6b 2656 if (do_utf8) {
fd400ab9 2657 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2658 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2659 if (!(OP(scan) == SPACE
3568d838 2660 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2661 : isSPACE_LC_utf8((U8*)locinput)))
2662 {
2663 sayNO;
2664 }
2665 locinput += PL_utf8skip[nextchr];
2666 nextchr = UCHARAT(locinput);
2667 break;
d6a28714 2668 }
ffc61ed2
JH
2669 if (!(OP(scan) == SPACE
2670 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2671 sayNO;
2672 nextchr = UCHARAT(++locinput);
2673 }
2674 else {
2675 if (!(OP(scan) == SPACE
2676 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2677 sayNO;
2678 nextchr = UCHARAT(++locinput);
a0ed51b3 2679 }
a0ed51b3 2680 break;
d6a28714 2681 case NSPACEL:
3280af22 2682 PL_reg_flags |= RF_tainted;
bbce6d69 2683 /* FALL THROUGH */
d6a28714 2684 case NSPACE:
9442cb0e 2685 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2686 sayNO;
1aa99e6b 2687 if (do_utf8) {
8269fa76 2688 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2689 if (OP(scan) == NSPACE
3568d838 2690 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2691 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2692 {
2693 sayNO;
2694 }
2695 locinput += PL_utf8skip[nextchr];
2696 nextchr = UCHARAT(locinput);
2697 break;
a0ed51b3 2698 }
ffc61ed2 2699 if (OP(scan) == NSPACE
d6a28714 2700 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2701 sayNO;
76e3520e 2702 nextchr = UCHARAT(++locinput);
a0d0e21e 2703 break;
d6a28714 2704 case DIGITL:
a0ed51b3
LW
2705 PL_reg_flags |= RF_tainted;
2706 /* FALL THROUGH */
d6a28714 2707 case DIGIT:
9442cb0e 2708 if (!nextchr)
a0ed51b3 2709 sayNO;
1aa99e6b 2710 if (do_utf8) {
8269fa76 2711 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2712 if (!(OP(scan) == DIGIT
3568d838 2713 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2714 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2715 {
a0ed51b3 2716 sayNO;
dfe13c55 2717 }
6f06b55f 2718 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2719 nextchr = UCHARAT(locinput);
2720 break;
2721 }
ffc61ed2 2722 if (!(OP(scan) == DIGIT
9442cb0e 2723 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2724 sayNO;
2725 nextchr = UCHARAT(++locinput);
2726 break;
d6a28714 2727 case NDIGITL:
b8c5462f
JH
2728 PL_reg_flags |= RF_tainted;
2729 /* FALL THROUGH */
d6a28714 2730 case NDIGIT:
9442cb0e 2731 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2732 sayNO;
1aa99e6b 2733 if (do_utf8) {
8269fa76 2734 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2735 if (OP(scan) == NDIGIT
3568d838 2736 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2737 : isDIGIT_LC_utf8((U8*)locinput))
2738 {
a0ed51b3 2739 sayNO;
9442cb0e 2740 }
6f06b55f 2741 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2742 nextchr = UCHARAT(locinput);
2743 break;
2744 }
ffc61ed2 2745 if (OP(scan) == NDIGIT
9442cb0e 2746 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2747 sayNO;
2748 nextchr = UCHARAT(++locinput);
2749 break;
2750 case CLUMP:
b7c83a7e 2751 if (locinput >= PL_regeol)
a0ed51b3 2752 sayNO;
b7c83a7e
JH
2753 if (do_utf8) {
2754 LOAD_UTF8_CHARCLASS(mark,"~");
2755 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2756 sayNO;
2757 locinput += PL_utf8skip[nextchr];
2758 while (locinput < PL_regeol &&
2759 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2760 locinput += UTF8SKIP(locinput);
2761 if (locinput > PL_regeol)
2762 sayNO;
eb08e2da
JH
2763 }
2764 else
2765 locinput++;
a0ed51b3
LW
2766 nextchr = UCHARAT(locinput);
2767 break;
c8756f30 2768 case REFFL:
3280af22 2769 PL_reg_flags |= RF_tainted;
c8756f30 2770 /* FALL THROUGH */
c277df42 2771 case REF:
c8756f30 2772 case REFF:
c277df42 2773 n = ARG(scan); /* which paren pair */
cf93c79d 2774 ln = PL_regstartp[n];
2c2d71f5 2775 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2776 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2777 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2778 if (ln == PL_regendp[n])
a0d0e21e 2779 break;
a0ed51b3 2780
cf93c79d 2781 s = PL_bostr + ln;
1aa99e6b 2782 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2783 char *l = locinput;
cf93c79d 2784 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2785 /*
2786 * Note that we can't do the "other character" lookup trick as
2787 * in the 8-bit case (no pun intended) because in Unicode we
2788 * have to map both upper and title case to lower case.
2789 */
2790 if (OP(scan) == REFF) {
a2a2844f 2791 STRLEN ulen1, ulen2;
e7ae6809
JH
2792 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2793 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2794 while (s < e) {
2795 if (l >= PL_regeol)
2796 sayNO;
a2a2844f
JH
2797 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2798 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2799 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2800 sayNO;
a2a2844f
JH
2801 s += ulen1;
2802 l += ulen2;
a0ed51b3
LW
2803 }
2804 }
2805 locinput = l;
2806 nextchr = UCHARAT(locinput);
2807 break;
2808 }
2809
a0d0e21e 2810 /* Inline the first character, for speed. */
76e3520e 2811 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2812 (OP(scan) == REF ||
2813 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2814 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2815 sayNO;
cf93c79d 2816 ln = PL_regendp[n] - ln;
3280af22 2817 if (locinput + ln > PL_regeol)
4633a7c4 2818 sayNO;
c8756f30
AK
2819 if (ln > 1 && (OP(scan) == REF
2820 ? memNE(s, locinput, ln)
2821 : (OP(scan) == REFF
2822 ? ibcmp(s, locinput, ln)
2823 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2824 sayNO;
a0d0e21e 2825 locinput += ln;
76e3520e 2826 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2827 break;
2828
2829 case NOTHING:
c277df42 2830 case TAIL:
a0d0e21e
LW
2831 break;
2832 case BACK:
2833 break;
c277df42
IZ
2834 case EVAL:
2835 {
2836 dSP;
533c011a 2837 OP_4tree *oop = PL_op;
3280af22 2838 COP *ocurcop = PL_curcop;
f3548bdc 2839 PAD *old_comppad;
c277df42 2840 SV *ret;
080c2dec 2841 struct regexp *oreg = PL_reg_re;
9041c2e3 2842
c277df42 2843 n = ARG(scan);
533c011a 2844 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2845 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2846 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2847 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2848
8e5e9ebe
RGS
2849 {
2850 SV **before = SP;
2851 CALLRUNOPS(aTHX); /* Scalar context. */
2852 SPAGAIN;
2853 if (SP == before)
075aa684 2854 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2855 else {
2856 ret = POPs;
2857 PUTBACK;
2858 }
2859 }
2860
0f5d15d6 2861 PL_op = oop;
f3548bdc 2862 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2863 PL_curcop = ocurcop;
c277df42 2864 if (logical) {
0f5d15d6
IZ
2865 if (logical == 2) { /* Postponed subexpression. */
2866 regexp *re;
22c35a8c 2867 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2868 re_cc_state state;
0f5d15d6 2869 CHECKPOINT cp, lastcp;
cb50f42d 2870 int toggleutf;
faf82a0b 2871 register SV *sv;
0f5d15d6 2872
faf82a0b
AE
2873 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2874 mg = mg_find(sv, PERL_MAGIC_qr);
2875 else if (SvSMAGICAL(ret)) {
2876 if (SvGMAGICAL(ret))
2877 sv_unmagic(ret, PERL_MAGIC_qr);
2878 else
2879 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 2880 }
faf82a0b 2881
0f5d15d6
IZ
2882 if (mg) {
2883 re = (regexp *)mg->mg_obj;
df0003d4 2884 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2885 }
2886 else {
2887 STRLEN len;
2888 char *t = SvPV(ret, len);
2889 PMOP pm;
2890 char *oprecomp = PL_regprecomp;
2891 I32 osize = PL_regsize;
2892 I32 onpar = PL_regnpar;
2893
5fcd1c1b 2894 Zero(&pm, 1, PMOP);
cb50f42d 2895 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2896 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2897 if (!(SvFLAGS(ret)
faf82a0b
AE
2898 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2899 | SVs_GMG)))
14befaf4
DM
2900 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2901 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2902 PL_regprecomp = oprecomp;
2903 PL_regsize = osize;
2904 PL_regnpar = onpar;
2905 }
2906 DEBUG_r(
9041c2e3 2907 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2908 "Entering embedded `%s%.60s%s%s'\n",
2909 PL_colors[0],
2910 re->precomp,
2911 PL_colors[1],
2912 (strlen(re->precomp) > 60 ? "..." : ""))
2913 );
2914 state.node = next;
2915 state.prev = PL_reg_call_cc;
2916 state.cc = PL_regcc;
2917 state.re = PL_reg_re;
2918
2ab05381 2919 PL_regcc = 0;
9041c2e3 2920
0f5d15d6 2921 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2922 REGCP_SET(lastcp);
0f5d15d6
IZ
2923 cache_re(re);
2924 state.ss = PL_savestack_ix;
2925 *PL_reglastparen = 0;
a01268b5 2926 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2927 PL_reg_call_cc = &state;
2928 PL_reginput = locinput;
cb50f42d
YST
2929 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2930 ((re->reganch & ROPT_UTF8) != 0);
2931 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2932
2933 /* XXXX This is too dramatic a measure... */
2934 PL_reg_maxiter = 0;
2935
0f5d15d6 2936 if (regmatch(re->program + 1)) {
2c914db6
IZ
2937 /* Even though we succeeded, we need to restore
2938 global variables, since we may be wrapped inside
2939 SUSPEND, thus the match may be not finished yet. */
2940
2941 /* XXXX Do this only if SUSPENDed? */
2942 PL_reg_call_cc = state.prev;
2943 PL_regcc = state.cc;
2944 PL_reg_re = state.re;
2945 cache_re(PL_reg_re);
cb50f42d 2946 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2947
2948 /* XXXX This is too dramatic a measure... */
2949 PL_reg_maxiter = 0;
2950
2951 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2952 ReREFCNT_dec(re);
2953 regcpblow(cp);
2954 sayYES;
2955 }
0f5d15d6 2956 ReREFCNT_dec(re);
02db2b7b 2957 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2958 regcppop();
2959 PL_reg_call_cc = state.prev;
2960 PL_regcc = state.cc;
2961 PL_reg_re = state.re;
d3790889 2962 cache_re(PL_reg_re);
cb50f42d 2963 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2964
2965 /* XXXX This is too dramatic a measure... */
2966 PL_reg_maxiter = 0;
2967
8e514ae6 2968 logical = 0;
0f5d15d6
IZ
2969 sayNO;
2970 }
c277df42 2971 sw = SvTRUE(ret);
0f5d15d6 2972 logical = 0;
a0ed51b3 2973 }
080c2dec 2974 else {
3280af22 2975 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
2976 cache_re(oreg);
2977 }
c277df42
IZ
2978 break;
2979 }
a0d0e21e 2980 case OPEN:
c277df42 2981 n = ARG(scan); /* which paren pair */
3280af22
NIS
2982 PL_reg_start_tmp[n] = locinput;
2983 if (n > PL_regsize)
2984 PL_regsize = n;
a0d0e21e
LW
2985 break;
2986 case CLOSE:
c277df42 2987 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2988 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2989 PL_regendp[n] = locinput - PL_bostr;
eb160463 2990 if (n > (I32)*PL_reglastparen)
3280af22 2991 *PL_reglastparen = n;
a01268b5 2992 *PL_reglastcloseparen = n;
a0d0e21e 2993 break;
c277df42
IZ
2994 case GROUPP:
2995 n = ARG(scan); /* which paren pair */
eb160463 2996 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2997 break;
2998 case IFTHEN:
2c2d71f5 2999 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
3000 if (sw)
3001 next = NEXTOPER(NEXTOPER(scan));
3002 else {
3003 next = scan + ARG(scan);
3004 if (OP(next) == IFTHEN) /* Fake one. */
3005 next = NEXTOPER(NEXTOPER(next));
3006 }
3007 break;
3008 case LOGICAL:
0f5d15d6 3009 logical = scan->flags;
c277df42 3010 break;
2ab05381
IZ
3011/*******************************************************************
3012 PL_regcc contains infoblock about the innermost (...)* loop, and
3013 a pointer to the next outer infoblock.
3014
3015 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3016
3017 1) After matching X, regnode for CURLYX is processed;
3018
9041c2e3 3019 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3020 regmatch() recursively with the starting point at WHILEM node;
3021
3022 3) Each hit of WHILEM node tries to match A and Z (in the order
3023 depending on the current iteration, min/max of {min,max} and
3024 greediness). The information about where are nodes for "A"
3025 and "Z" is read from the infoblock, as is info on how many times "A"
3026 was already matched, and greediness.
3027
3028 4) After A matches, the same WHILEM node is hit again.
3029
3030 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3031 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3032 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3033 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3034 of the external loop.
3035
3036 Currently present infoblocks form a tree with a stem formed by PL_curcc
3037 and whatever it mentions via ->next, and additional attached trees
3038 corresponding to temporarily unset infoblocks as in "5" above.
3039
9041c2e3 3040 In the following picture infoblocks for outer loop of
2ab05381
IZ
3041 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3042 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3043 infoblocks are drawn below the "reset" infoblock.
3044
3045 In fact in the picture below we do not show failed matches for Z and T
3046 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3047 more obvious *why* one needs to *temporary* unset infoblocks.]
3048
3049 Matched REx position InfoBlocks Comment
3050 (Y(A)*?Z)*?T x
3051 Y(A)*?Z)*?T x <- O
3052 Y (A)*?Z)*?T x <- O
3053 Y A)*?Z)*?T x <- O <- I
3054 YA )*?Z)*?T x <- O <- I
3055 YA A)*?Z)*?T x <- O <- I
3056 YAA )*?Z)*?T x <- O <- I
3057 YAA Z)*?T x <- O # Temporary unset I
3058 I
3059
3060 YAAZ Y(A)*?Z)*?T x <- O
3061 I
3062
3063 YAAZY (A)*?Z)*?T x <- O
3064 I
3065
3066 YAAZY A)*?Z)*?T x <- O <- I
3067 I
3068
3069 YAAZYA )*?Z)*?T x <- O <- I
3070 I
3071
3072 YAAZYA Z)*?T x <- O # Temporary unset I
3073 I,I
3074
3075 YAAZYAZ )*?T x <- O
3076 I,I
3077
3078 YAAZYAZ T x # Temporary unset O
3079 O
3080 I,I
3081
3082 YAAZYAZT x
3083 O
3084 I,I
3085 *******************************************************************/
a0d0e21e
LW
3086 case CURLYX: {
3087 CURCUR cc;
3280af22 3088 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3089 /* No need to save/restore up to this paren */
3090 I32 parenfloor = scan->flags;
c277df42
IZ
3091
3092 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3093 next += ARG(next);
3280af22
NIS
3094 cc.oldcc = PL_regcc;
3095 PL_regcc = &cc;
cb434fcc
IZ
3096 /* XXXX Probably it is better to teach regpush to support
3097 parenfloor > PL_regsize... */
eb160463 3098 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3099 parenfloor = *PL_reglastparen; /* Pessimization... */
3100 cc.parenfloor = parenfloor;
a0d0e21e
LW
3101 cc.cur = -1;
3102 cc.min = ARG1(scan);
3103 cc.max = ARG2(scan);
c277df42 3104 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3105 cc.next = next;
3106 cc.minmod = minmod;
3107 cc.lastloc = 0;
3280af22 3108 PL_reginput = locinput;
a0d0e21e
LW
3109 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3110 regcpblow(cp);
3280af22 3111 PL_regcc = cc.oldcc;
4633a7c4 3112 saySAME(n);
a0d0e21e
LW
3113 }
3114 /* NOT REACHED */
3115 case WHILEM: {
3116 /*
3117 * This is really hard to understand, because after we match
3118 * what we're trying to match, we must make sure the rest of
2c2d71f5 3119 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3120 * to go back UP the parse tree by recursing ever deeper. And
3121 * if it fails, we have to reset our parent's current state
3122 * that we can try again after backing off.
3123 */
3124
c277df42 3125 CHECKPOINT cp, lastcp;
3280af22 3126 CURCUR* cc = PL_regcc;
c277df42
IZ
3127 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3128
4633a7c4 3129 n = cc->cur + 1; /* how many we know we matched */
3280af22 3130 PL_reginput = locinput;
a0d0e21e 3131
c277df42 3132 DEBUG_r(
9041c2e3 3133 PerlIO_printf(Perl_debug_log,
91f3b821 3134 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3135 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3136 (long)n, (long)cc->min,
2797576d 3137 (long)cc->max, PTR2UV(cc))
c277df42 3138 );
4633a7c4 3139
a0d0e21e
LW
3140 /* If degenerate scan matches "", assume scan done. */
3141
579cf2c3 3142 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3143 PL_regcc = cc->oldcc;
2ab05381
IZ
3144 if (PL_regcc)
3145 ln = PL_regcc->cur;
c277df42 3146 DEBUG_r(
c3464db5
DD
3147 PerlIO_printf(Perl_debug_log,
3148 "%*s empty match detected, try continuation...\n",
3280af22 3149 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3150 );
a0d0e21e 3151 if (regmatch(cc->next))
4633a7c4 3152 sayYES;
2ab05381
IZ
3153 if (PL_regcc)
3154 PL_regcc->cur = ln;
3280af22 3155 PL_regcc = cc;
4633a7c4 3156 sayNO;
a0d0e21e
LW
3157 }
3158
3159 /* First just match a string of min scans. */
3160
3161 if (n < cc->min) {
3162 cc->cur = n;
3163 cc->lastloc = locinput;
4633a7c4
LW
3164 if (regmatch(cc->scan))
3165 sayYES;
3166 cc->cur = n - 1;
c277df42 3167 cc->lastloc = lastloc;
4633a7c4 3168 sayNO;
a0d0e21e
LW
3169 }
3170
2c2d71f5
JH
3171 if (scan->flags) {
3172 /* Check whether we already were at this position.
3173 Postpone detection until we know the match is not
3174 *that* much linear. */
3175 if (!PL_reg_maxiter) {
3176 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3177 PL_reg_leftiter = PL_reg_maxiter;
3178 }
3179 if (PL_reg_leftiter-- == 0) {
3180 I32 size = (PL_reg_maxiter + 7)/8;
3181 if (PL_reg_poscache) {
eb160463 3182 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3183 Renew(PL_reg_poscache, size, char);
3184 PL_reg_poscache_size = size;
3185 }
3186 Zero(PL_reg_poscache, size, char);
3187 }
3188 else {
3189 PL_reg_poscache_size = size;
3190 Newz(29, PL_reg_poscache, size, char);
3191 }
3192 DEBUG_r(
3193 PerlIO_printf(Perl_debug_log,
3194 "%sDetected a super-linear match, switching on caching%s...\n",
3195 PL_colors[4], PL_colors[5])
3196 );
3197 }
3198 if (PL_reg_leftiter < 0) {
3199 I32 o = locinput - PL_bostr, b;
3200
3201 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3202 b = o % 8;
3203 o /= 8;
3204 if (PL_reg_poscache[o] & (1<<b)) {
3205 DEBUG_r(
3206 PerlIO_printf(Perl_debug_log,
3207 "%*s already tried at this position...\n",
3208 REPORT_CODE_OFF+PL_regindent*2, "")
3209 );
c2b0868c
HS
3210 if (PL_reg_flags & RF_false)
3211 sayYES;
3212 else
3213 sayNO_SILENT;
2c2d71f5
JH
3214 }
3215 PL_reg_poscache[o] |= (1<<b);
3216 }
3217 }
3218
a0d0e21e
LW
3219 /* Prefer next over scan for minimal matching. */
3220
3221 if (cc->minmod) {
3280af22 3222 PL_regcc = cc->oldcc;
2ab05381
IZ
3223 if (PL_regcc)
3224 ln = PL_regcc->cur;
5f05dabc 3225 cp = regcppush(cc->parenfloor);
02db2b7b 3226 REGCP_SET(lastcp);
5f05dabc 3227 if (regmatch(cc->next)) {
c277df42 3228 regcpblow(cp);
4633a7c4 3229 sayYES; /* All done. */
5f05dabc 3230 }
02db2b7b 3231 REGCP_UNWIND(lastcp);
5f05dabc 3232 regcppop();
2ab05381
IZ
3233 if (PL_regcc)
3234 PL_regcc->cur = ln;
3280af22 3235 PL_regcc = cc;
a0d0e21e 3236
c277df42 3237 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3238 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3239 && !(PL_reg_flags & RF_warned)) {
3240 PL_reg_flags |= RF_warned;
9014280d 3241 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3242 "Complex regular subexpression recursion",
3243 REG_INFTY - 1);
c277df42 3244 }
4633a7c4 3245 sayNO;
c277df42 3246 }
a687059c 3247
c277df42 3248 DEBUG_r(
c3464db5
DD
3249 PerlIO_printf(Perl_debug_log,
3250 "%*s trying longer...\n",
3280af22 3251 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3252 );
a0d0e21e 3253 /* Try scanning more and see if it helps. */
3280af22 3254 PL_reginput = locinput;
a0d0e21e
LW
3255 cc->cur = n;
3256 cc->lastloc = locinput;
5f05dabc 3257 cp = regcppush(cc->parenfloor);
02db2b7b 3258 REGCP_SET(lastcp);
5f05dabc 3259 if (regmatch(cc->scan)) {
c277df42 3260 regcpblow(cp);
4633a7c4 3261 sayYES;
5f05dabc 3262 }
02db2b7b 3263 REGCP_UNWIND(lastcp);
5f05dabc 3264 regcppop();
4633a7c4 3265 cc->cur = n - 1;
c277df42 3266 cc->lastloc = lastloc;
4633a7c4 3267 sayNO;
a0d0e21e
LW
3268 }
3269
3270 /* Prefer scan over next for maximal matching. */
3271
3272 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3273 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3274 cc->cur = n;
3275 cc->lastloc = locinput;
02db2b7b 3276 REGCP_SET(lastcp);
5f05dabc 3277 if (regmatch(cc->scan)) {
c277df42 3278 regcpblow(cp);
4633a7c4 3279 sayYES;
5f05dabc 3280 }
02db2b7b 3281 REGCP_UNWIND(lastcp);
a0d0e21e 3282 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3283 PL_reginput = locinput;
c277df42 3284 DEBUG_r(
c3464db5
DD
3285 PerlIO_printf(Perl_debug_log,
3286 "%*s failed, try continuation...\n",
3280af22 3287 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3288 );
3289 }
9041c2e3 3290 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3291 && !(PL_reg_flags & RF_warned)) {
3280af22 3292 PL_reg_flags |= RF_warned;
9014280d 3293 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3294 "Complex regular subexpression recursion",
3295 REG_INFTY - 1);
a0d0e21e
LW
3296 }
3297
3298 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3299 PL_regcc = cc->oldcc;
2ab05381
IZ
3300 if (PL_regcc)
3301 ln = PL_regcc->cur;
a0d0e21e 3302 if (regmatch(cc->next))
4633a7c4 3303 sayYES;
2ab05381
IZ
3304 if (PL_regcc)
3305 PL_regcc->cur = ln;
3280af22 3306 PL_regcc = cc;
4633a7c4 3307 cc->cur = n - 1;
c277df42 3308 cc->lastloc = lastloc;
4633a7c4 3309 sayNO;
a0d0e21e
LW
3310 }
3311 /* NOT REACHED */
9041c2e3 3312 case BRANCHJ:
c277df42
IZ
3313 next = scan + ARG(scan);
3314 if (next == scan)
3315 next = NULL;
3316 inner = NEXTOPER(NEXTOPER(scan));
3317 goto do_branch;
9041c2e3 3318 case BRANCH:
c277df42
IZ
3319 inner = NEXTOPER(scan);
3320 do_branch:
3321 {
c277df42
IZ
3322 c1 = OP(scan);
3323 if (OP(next) != c1) /* No choice. */
3324 next = inner; /* Avoid recursion. */
a0d0e21e 3325 else {
02db2b7b
IZ
3326 I32 lastparen = *PL_reglastparen;
3327 I32 unwind1;
3328 re_unwind_branch_t *uw;
3329
3330 /* Put unwinding data on stack */
3331 unwind1 = SSNEWt(1,re_unwind_branch_t);
3332 uw = SSPTRt(unwind1,re_unwind_branch_t);
3333 uw->prev = unwind;
3334 unwind = unwind1;
3335 uw->type = ((c1 == BRANCH)
3336 ? RE_UNWIND_BRANCH
3337 : RE_UNWIND_BRANCHJ);
3338 uw->lastparen = lastparen;
3339 uw->next = next;
3340 uw->locinput = locinput;
3341 uw->nextchr = nextchr;
3342#ifdef DEBUGGING
3343 uw->regindent = ++PL_regindent;
3344#endif
c277df42 3345
02db2b7b
IZ
3346 REGCP_SET(uw->lastcp);
3347
3348 /* Now go into the first branch */
3349 next = inner;
a687059c 3350 }
a0d0e21e
LW
3351 }
3352 break;
3353 case MINMOD:
3354 minmod = 1;
3355 break;
c277df42
IZ
3356 case CURLYM:
3357 {
00db4c45 3358 I32 l = 0;
c277df42 3359 CHECKPOINT lastcp;
9041c2e3 3360
c277df42
IZ
3361 /* We suppose that the next guy does not need
3362 backtracking: in particular, it is of constant length,
3363 and has no parenths to influence future backrefs. */
3364 ln = ARG1(scan); /* min to match */
3365 n = ARG2(scan); /* max to match */
c277df42
IZ
3366 paren = scan->flags;
3367 if (paren) {
3280af22
NIS
3368 if (paren > PL_regsize)
3369 PL_regsize = paren;
eb160463 3370 if (paren > (I32)*PL_reglastparen)
3280af22 3371 *PL_reglastparen = paren;
c277df42 3372 }
dc45a647 3373 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3374 if (paren)
3375 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3376 PL_reginput = locinput;
c277df42
IZ
3377 if (minmod) {
3378 minmod = 0;
3379 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3380 sayNO;
f31a99c8
HS
3381 /* if we matched something zero-length we don't need to
3382 backtrack - capturing parens are already defined, so
3383 the caveat in the maximal case doesn't apply
3384
3385 XXXX if ln == 0, we can redo this check first time
3386 through the following loop
3387 */
3388 if (ln && l == 0)
3389 n = ln; /* don't backtrack */
3280af22 3390 locinput = PL_reginput;
cca55fe3 3391 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3392 regnode *text_node = next;
3393
cca55fe3 3394 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3395
cca55fe3 3396 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3397 else {
cca55fe3 3398 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3399 c1 = c2 = -1000;
3400 goto assume_ok_MM;
cca55fe3
JP
3401 }
3402 else { c1 = (U8)*STRING(text_node); }
af5decee 3403 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3404 c2 = PL_fold[c1];
af5decee 3405 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3406 c2 = PL_fold_locale[c1];
3407 else
3408 c2 = c1;
3409 }
a0ed51b3
LW
3410 }
3411 else
c277df42 3412 c1 = c2 = -1000;
cca55fe3 3413 assume_ok_MM:
02db2b7b 3414 REGCP_SET(lastcp);
5f4b28b2 3415 /* This may be improved if l == 0. */
c277df42
IZ
3416 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3417 /* If it could work, try it. */
3418 if (c1 == -1000 ||
3280af22
NIS
3419 UCHARAT(PL_reginput) == c1 ||
3420 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3421 {
3422 if (paren) {
f31a99c8 3423 if (ln) {
cf93c79d
IZ
3424 PL_regstartp[paren] =
3425 HOPc(PL_reginput, -l) - PL_bostr;
3426 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3427 }
3428 else
cf93c79d 3429 PL_regendp[paren] = -1;
c277df42
IZ
3430 }
3431 if (regmatch(next))
3432 sayYES;
02db2b7b 3433 REGCP_UNWIND(lastcp);
c277df42
IZ
3434 }
3435 /* Couldn't or didn't -- move forward. */
3280af22 3436 PL_reginput = locinput;
c277df42
IZ
3437 if (regrepeat_hard(scan, 1, &l)) {
3438 ln++;
3280af22 3439 locinput = PL_reginput;
c277df42
IZ
3440 }
3441 else
3442 sayNO;
3443 }
a0ed51b3
LW
3444 }
3445 else {
c277df42 3446 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3447 /* if we matched something zero-length we don't need to
3448 backtrack, unless the minimum count is zero and we
3449 are capturing the result - in that case the capture
3450 being defined or not may affect later execution
3451 */
3452 if (n != 0 && l == 0 && !(paren && ln == 0))
3453 ln = n; /* don't backtrack */
3280af22 3454 locinput = PL_reginput;
c277df42 3455 DEBUG_r(
5c0ca799 3456 PerlIO_printf(Perl_debug_log,
faccc32b 3457 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3458 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3459 (IV) n, (IV)l)
c277df42
IZ
3460 );
3461 if (n >= ln) {
cca55fe3 3462 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3463 regnode *text_node = next;
3464
cca55fe3 3465 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3466
cca55fe3 3467 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3468 else {
cca55fe3 3469 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3470 c1 = c2 = -1000;
3471 goto assume_ok_REG;
cca55fe3
JP
3472 }
3473 else { c1 = (U8)*STRING(text_node); }
3474
af5decee 3475 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3476 c2 = PL_fold[c1];
af5decee 3477 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3478 c2 = PL_fold_locale[c1];
3479 else
3480 c2 = c1;
3481 }
a0ed51b3
LW
3482 }
3483 else
c277df42
IZ
3484 c1 = c2 = -1000;
3485 }
cca55fe3 3486 assume_ok_REG:
02db2b7b 3487 REGCP_SET(lastcp);
c277df42
IZ
3488 while (n >= ln) {
3489 /* If it could work, try it. */
3490 if (c1 == -1000 ||
3280af22
NIS
3491 UCHARAT(PL_reginput) == c1 ||
3492 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3493 {
3494 DEBUG_r(
c3464db5 3495 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3496 "%*s trying tail with n=%"IVdf"...\n",
3497 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3498 );
3499 if (paren) {
3500 if (n) {
cf93c79d
IZ
3501 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3502 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3503 }
a0ed51b3 3504 else
cf93c79d 3505 PL_regendp[paren] = -1;
c277df42 3506 }
a0ed51b3
LW
3507 if (regmatch(next))
3508 sayYES;
02db2b7b 3509 REGCP_UNWIND(lastcp);
a0ed51b3 3510 }
c277df42
IZ
3511 /* Couldn't or didn't -- back up. */
3512 n--;
dfe13c55 3513 locinput = HOPc(locinput, -l);
3280af22 3514 PL_reginput = locinput;
c277df42
IZ
3515 }
3516 }
3517 sayNO;
3518 break;
3519 }
3520 case CURLYN:
3521 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3522 if (paren > PL_regsize)
3523 PL_regsize = paren;
eb160463 3524 if (paren > (I32)*PL_reglastparen)
3280af22 3525 *PL_reglastparen = paren;
c277df42
IZ
3526 ln = ARG1(scan); /* min to match */
3527 n = ARG2(scan); /* max to match */
dc45a647 3528 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3529 goto repeat;
a0d0e21e 3530 case CURLY:
c277df42 3531 paren = 0;
a0d0e21e
LW
3532 ln = ARG1(scan); /* min to match */
3533 n = ARG2(scan); /* max to match */
dc45a647 3534 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3535 goto repeat;
3536 case STAR:
3537 ln = 0;
c277df42 3538 n = REG_INFTY;
a0d0e21e 3539 scan = NEXTOPER(scan);
c277df42 3540 paren = 0;
a0d0e21e
LW
3541 goto repeat;
3542 case PLUS:
c277df42
IZ
3543 ln = 1;
3544 n = REG_INFTY;
3545 scan = NEXTOPER(scan);
3546 paren = 0;
3547 repeat:
a0d0e21e
LW
3548 /*
3549 * Lookahead to avoid useless match attempts
3550 * when we know what character comes next.
3551 */
5f80c4cf
JP
3552
3553 /*
3554 * Used to only do .*x and .*?x, but now it allows
3555 * for )'s, ('s and (?{ ... })'s to be in the way
3556 * of the quantifier and the EXACT-like node. -- japhy
3557 */
3558
cca55fe3 3559 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3560 U8 *s;
3561 regnode *text_node = next;
3562
cca55fe3 3563 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3564
cca55fe3 3565 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3566 else {
cca55fe3 3567 if (PL_regkind[(U8)OP(text_node)] == REF) {
44a68960
JH
3568 c1 = c2 = -1000;
3569 goto assume_ok_easy;
cca55fe3
JP
3570 }
3571 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3572
3573 if (!UTF) {
3574 c2 = c1 = *s;
f65d3ee7 3575 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3576 c2 = PL_fold[c1];
f65d3ee7 3577 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3578 c2 = PL_fold_locale[c1];
1aa99e6b 3579 }
5f80c4cf 3580 else { /* UTF */
f65d3ee7 3581 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3582 STRLEN ulen1, ulen2;
e7ae6809
JH
3583 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3584 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3585
3586 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3587 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3588
872c91ae
JH
3589 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3590 ckWARN(WARN_UTF8) ?
3591 0 : UTF8_ALLOW_ANY);
3592 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3593 ckWARN(WARN_UTF8) ?
3594 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3595 }
3596 else {
872c91ae
JH
3597 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3598 ckWARN(WARN_UTF8) ?
3599 0 : UTF8_ALLOW_ANY);
5f80c4cf 3600 }
1aa99e6b
IH
3601 }
3602 }
bbce6d69 3603 }
a0d0e21e 3604 else
bbce6d69 3605 c1 = c2 = -1000;
cca55fe3 3606 assume_ok_easy:
3280af22 3607 PL_reginput = locinput;
a0d0e21e 3608 if (minmod) {
c277df42 3609 CHECKPOINT lastcp;
a0d0e21e
LW
3610 minmod = 0;
3611 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3612 sayNO;
a0ed51b3 3613 locinput = PL_reginput;
02db2b7b 3614 REGCP_SET(lastcp);
0fe9bf95 3615 if (c1 != -1000) {
1aa99e6b 3616 char *e; /* Should not check after this */
0fe9bf95 3617 char *old = locinput;
b2f2f093 3618 int count = 0;
0fe9bf95 3619
1aa99e6b 3620 if (n == REG_INFTY) {
0fe9bf95 3621 e = PL_regeol - 1;
1aa99e6b
IH
3622 if (do_utf8)
3623 while (UTF8_IS_CONTINUATION(*(U8*)e))
3624 e--;
3625 }
3626 else if (do_utf8) {
3627 int m = n - ln;
3628 for (e = locinput;
3629 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3630 e += UTF8SKIP(e);
3631 }
3632 else {
3633 e = locinput + n - ln;
3634 if (e >= PL_regeol)
3635 e = PL_regeol - 1;
3636 }
0fe9bf95
IZ
3637 while (1) {
3638 /* Find place 'next' could work */
1aa99e6b
IH
3639 if (!do_utf8) {
3640 if (c1 == c2) {
a8e8ab15
JH
3641 while (locinput <= e &&
3642 UCHARAT(locinput) != c1)
1aa99e6b
IH
3643 locinput++;
3644 } else {
9041c2e3 3645 while (locinput <= e
a8e8ab15
JH
3646 && UCHARAT(locinput) != c1
3647 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3648 locinput++;
3649 }
3650 count = locinput - old;
3651 }
3652 else {
3653 STRLEN len;
3654 if (c1 == c2) {
872c91ae
JH
3655 /* count initialised to
3656 * utf8_distance(old, locinput) */
b2f2f093 3657 while (locinput <= e &&
872c91ae
JH
3658 utf8n_to_uvchr((U8*)locinput,
3659 UTF8_MAXLEN, &len,
3660 ckWARN(WARN_UTF8) ?
eb160463 3661 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3662 locinput += len;
b2f2f093
JH
3663 count++;
3664 }
1aa99e6b 3665 } else {
872c91ae
JH
3666 /* count initialised to
3667 * utf8_distance(old, locinput) */
b2f2f093 3668 while (locinput <= e) {
872c91ae
JH
3669 UV c = utf8n_to_uvchr((U8*)locinput,
3670 UTF8_MAXLEN, &len,
3671 ckWARN(WARN_UTF8) ?
3672 0 : UTF8_ALLOW_ANY);
eb160463 3673 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3674 break;
b2f2f093
JH
3675 locinput += len;
3676 count++;
1aa99e6b
IH
3677 }
3678 }
0fe9bf95 3679 }
9041c2e3 3680 if (locinput > e)
0fe9bf95
IZ
3681 sayNO;
3682 /* PL_reginput == old now */
3683 if (locinput != old) {
3684 ln = 1; /* Did some */
1aa99e6b 3685 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3686 sayNO;
3687 }
3688 /* PL_reginput == locinput now */
29d1e993 3689 TRYPAREN(paren, ln, locinput);
0fe9bf95 3690 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3691 REGCP_UNWIND(lastcp);
0fe9bf95 3692 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3693 old = locinput;
3694 if (do_utf8)
3695 locinput += UTF8SKIP(locinput);
3696 else
3697 locinput++;
b2f2f093 3698 count = 1;
0fe9bf95
IZ
3699 }
3700 }
3701 else
c277df42 3702 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3703 UV c;
3704 if (c1 != -1000) {
3705 if (do_utf8)
872c91ae
JH
3706 c = utf8n_to_uvchr((U8*)PL_reginput,
3707 UTF8_MAXLEN, 0,
3708 ckWARN(WARN_UTF8) ?
3709 0 : UTF8_ALLOW_ANY);
1aa99e6b 3710 else
9041c2e3 3711 c = UCHARAT(PL_reginput);
2390ecbc 3712 /* If it could work, try it. */
eb160463 3713 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3714 {
ecc99935 3715 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3716 REGCP_UNWIND(lastcp);
3717 }
1aa99e6b 3718 }
a0d0e21e 3719 /* If it could work, try it. */
2390ecbc 3720 else if (c1 == -1000)
bbce6d69 3721 {
ecc99935 3722 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3723 REGCP_UNWIND(lastcp);
bbce6d69 3724 }
c277df42 3725 /* Couldn't or didn't -- move forward. */
a0ed51b3 3726 PL_reginput = locinput;
a0d0e21e
LW
3727 if (regrepeat(scan, 1)) {
3728 ln++;
a0ed51b3
LW
3729 locinput = PL_reginput;
3730 }
3731 else
4633a7c4 3732 sayNO;
a0d0e21e
LW
3733 }
3734 }
3735 else {
c277df42 3736 CHECKPOINT lastcp;
a0d0e21e 3737 n = regrepeat(scan, n);
a0ed51b3 3738 locinput = PL_reginput;
22c35a8c 3739 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3740 ((!PL_multiline && OP(next) != MEOL) ||
3741 OP(next) == SEOL || OP(next) == EOS))
3742 {
a0d0e21e 3743 ln = n; /* why back off? */
1aeab75a
GS
3744 /* ...because $ and \Z can match before *and* after
3745 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3746 We should back off by one in this case. */
3747 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3748 ln--;
3749 }
02db2b7b 3750 REGCP_SET(lastcp);
c277df42 3751 if (paren) {
8fa7f367 3752 UV c = 0;
c277df42 3753 while (n >= ln) {
1aa99e6b
IH
3754 if (c1 != -1000) {
3755 if (do_utf8)
872c91ae
JH
3756 c = utf8n_to_uvchr((U8*)PL_reginput,
3757 UTF8_MAXLEN, 0,
3758 ckWARN(WARN_UTF8) ?
3759 0 : UTF8_ALLOW_ANY);
1aa99e6b 3760 else
9041c2e3 3761 c = UCHARAT(PL_reginput);
1aa99e6b 3762 }
c277df42 3763 /* If it could work, try it. */
eb160463 3764 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3765 {
29d1e993 3766 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3767 REGCP_UNWIND(lastcp);
c277df42
IZ
3768 }
3769 /* Couldn't or didn't -- back up. */
3770 n--;
dfe13c55 3771 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3772 }
a0ed51b3
LW
3773 }
3774 else {
8fa7f367 3775 UV c = 0;
c277df42 3776 while (n >= ln) {
1aa99e6b
IH
3777 if (c1 != -1000) {
3778 if (do_utf8)
872c91ae
JH
3779 c = utf8n_to_uvchr((U8*)PL_reginput,
3780 UTF8_MAXLEN, 0,
3781 ckWARN(WARN_UTF8) ?
3782 0 : UTF8_ALLOW_ANY);
1aa99e6b 3783 else
9041c2e3 3784 c = UCHARAT(PL_reginput);
1aa99e6b 3785 }
c277df42 3786 /* If it could work, try it. */
eb160463 3787 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3788 {
29d1e993 3789 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3790 REGCP_UNWIND(lastcp);
c277df42
IZ
3791 }
3792 /* Couldn't or didn't -- back up. */
3793 n--;
dfe13c55 3794 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3795 }
a0d0e21e
LW
3796 }
3797 }
4633a7c4 3798 sayNO;
c277df42 3799 break;
a0d0e21e 3800 case END:
0f5d15d6
IZ
3801 if (PL_reg_call_cc) {
3802 re_cc_state *cur_call_cc = PL_reg_call_cc;
3803 CURCUR *cctmp = PL_regcc;
3804 regexp *re = PL_reg_re;
3805 CHECKPOINT cp, lastcp;
3806
3807 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3808 REGCP_SET(lastcp);
0f5d15d6
IZ
3809 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3810 the caller. */
3811 PL_reginput = locinput; /* Make position available to
3812 the callcc. */
3813 cache_re(PL_reg_call_cc->re);
3814 PL_regcc = PL_reg_call_cc->cc;
3815 PL_reg_call_cc = PL_reg_call_cc->prev;
3816 if (regmatch(cur_call_cc->node)) {
3817 PL_reg_call_cc = cur_call_cc;
3818 regcpblow(cp);
3819 sayYES;
3820 }
02db2b7b 3821 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3822 regcppop();
3823 PL_reg_call_cc = cur_call_cc;
3824 PL_regcc = cctmp;
3825 PL_reg_re = re;
3826 cache_re(re);
3827
3828 DEBUG_r(
3829 PerlIO_printf(Perl_debug_log,
3830 "%*s continuation failed...\n",
3831 REPORT_CODE_OFF+PL_regindent*2, "")
3832 );
7821416a 3833 sayNO_SILENT;
0f5d15d6 3834 }
7821416a
IZ
3835 if (locinput < PL_regtill) {
3836 DEBUG_r(PerlIO_printf(Perl_debug_log,
3837 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3838 PL_colors[4],
3839 (long)(locinput - PL_reg_starttry),
3840 (long)(PL_regtill - PL_reg_starttry),
3841 PL_colors[5]));
3842 sayNO_FINAL; /* Cannot match: too short. */
3843 }
3844 PL_reginput = locinput; /* put where regtry can find it */
3845 sayYES_FINAL; /* Success! */
7e5428c5 3846 case SUCCEED:
3280af22 3847 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3848 sayYES_LOUD; /* Success! */
c277df42
IZ
3849 case SUSPEND:
3850 n = 1;
9fe1d20c 3851 PL_reginput = locinput;
9041c2e3 3852 goto do_ifmatch;
a0d0e21e 3853 case UNLESSM:
c277df42 3854 n = 0;
a0ed51b3 3855 if (scan->flags) {
efb30f32
HS
3856 s = HOPBACKc(locinput, scan->flags);
3857 if (!s)
3858 goto say_yes;
3859 PL_reginput = s;
a0ed51b3
LW
3860 }
3861 else
3862 PL_reginput = locinput;
c2b0868c 3863 PL_reg_flags ^= RF_false;
c277df42
IZ
3864 goto do_ifmatch;
3865 case IFMATCH:
3866 n = 1;
a0ed51b3 3867 if (scan->flags) {
efb30f32
HS
3868 s = HOPBACKc(locinput, scan->flags);
3869 if (!s)
3870 goto say_no;
3871 PL_reginput = s;
a0ed51b3
LW
3872 }
3873 else
3874 PL_reginput = locinput;
3875
c277df42 3876 do_ifmatch:
c277df42
IZ
3877 inner = NEXTOPER(NEXTOPER(scan));
3878 if (regmatch(inner) != n) {
c2b0868c
HS
3879 if (n == 0)
3880 PL_reg_flags ^= RF_false;
c277df42
IZ
3881 say_no:
3882 if (logical) {
3883 logical = 0;
3884 sw = 0;
3885 goto do_longjump;
a0ed51b3
LW
3886 }
3887 else
c277df42
IZ
3888 sayNO;
3889 }
c2b0868c
HS
3890 if (n == 0)
3891 PL_reg_flags ^= RF_false;
c277df42
IZ
3892 say_yes:
3893 if (logical) {
3894 logical = 0;
3895 sw = 1;
3896 }
fe44a5e8 3897 if (OP(scan) == SUSPEND) {
3280af22 3898 locinput = PL_reginput;
565764a8 3899 nextchr = UCHARAT(locinput);
fe44a5e8 3900 }
c277df42
IZ
3901 /* FALL THROUGH. */
3902 case LONGJMP:
3903 do_longjump:
3904 next = scan + ARG(scan);
3905 if (next == scan)
3906 next = NULL;
a0d0e21e
LW
3907 break;
3908 default:
b900a521 3909 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3910 PTR2UV(scan), OP(scan));
cea2e8a9 3911 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3912 }
02db2b7b 3913 reenter:
a0d0e21e
LW
3914 scan = next;
3915 }
a687059c 3916
a0d0e21e
LW
3917 /*
3918 * We get here only if there's trouble -- normally "case END" is
3919 * the terminating point.
3920 */
cea2e8a9 3921 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3922 /*NOTREACHED*/
4633a7c4
LW
3923 sayNO;
3924
7821416a
IZ
3925yes_loud:
3926 DEBUG_r(
3927 PerlIO_printf(Perl_debug_log,
3928 "%*s %scould match...%s\n",
3929 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3930 );
3931 goto yes;
3932yes_final:
3933 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3934 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3935yes:
3936#ifdef DEBUGGING
3280af22 3937 PL_regindent--;
4633a7c4 3938#endif
02db2b7b
IZ
3939
3940#if 0 /* Breaks $^R */
3941 if (unwind)
3942 regcpblow(firstcp);
3943#endif
4633a7c4
LW
3944 return 1;
3945
3946no:
7821416a
IZ
3947 DEBUG_r(
3948 PerlIO_printf(Perl_debug_log,
3949 "%*s %sfailed...%s\n",
3950 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3951 );
3952 goto do_no;
3953no_final:
3954do_no:
02db2b7b
IZ
3955 if (unwind) {
3956 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3957
3958 switch (uw->type) {
3959 case RE_UNWIND_BRANCH:
3960 case RE_UNWIND_BRANCHJ:
3961 {
3962 re_unwind_branch_t *uwb = &(uw->branch);
3963 I32 lastparen = uwb->lastparen;
9041c2e3 3964
02db2b7b
IZ
3965 REGCP_UNWIND(uwb->lastcp);
3966 for (n = *PL_reglastparen; n > lastparen; n--)
3967 PL_regendp[n] = -1;
3968 *PL_reglastparen = n;
3969 scan = next = uwb->next;
9041c2e3
NIS
3970 if ( !scan ||
3971 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3972 ? BRANCH : BRANCHJ) ) { /* Failure */
3973 unwind = uwb->prev;
3974#ifdef DEBUGGING
3975 PL_regindent--;
3976#endif
3977 goto do_no;
3978 }
3979 /* Have more choice yet. Reuse the same uwb. */
3980 /*SUPPRESS 560*/
3981 if ((n = (uwb->type == RE_UNWIND_BRANCH
3982 ? NEXT_OFF(next) : ARG(next))))
3983 next += n;
3984 else
3985 next = NULL; /* XXXX Needn't unwinding in this case... */
3986 uwb->next = next;
3987 next = NEXTOPER(scan);
3988 if (uwb->type == RE_UNWIND_BRANCHJ)
3989 next = NEXTOPER(next);
3990 locinput = uwb->locinput;
3991 nextchr = uwb->nextchr;
3992#ifdef DEBUGGING
3993 PL_regindent = uwb->regindent;
3994#endif
3995
3996 goto reenter;
3997 }
3998 /* NOT REACHED */
3999 default:
4000 Perl_croak(aTHX_ "regexp unwind memory corruption");
4001 }
4002 /* NOT REACHED */
4003 }
4633a7c4 4004#ifdef DEBUGGING
3280af22 4005 PL_regindent--;
4633a7c4 4006#endif
a0d0e21e 4007 return 0;
a687059c
LW
4008}
4009
4010/*
4011 - regrepeat - repeatedly match something simple, report how many
4012 */
4013/*
4014 * [This routine now assumes that it will only match on things of length 1.
4015 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4016 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4017 */
76e3520e 4018STATIC I32
cea2e8a9 4019S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 4020{
a0d0e21e 4021 register char *scan;
a0d0e21e 4022 register I32 c;
3280af22 4023 register char *loceol = PL_regeol;
a0ed51b3 4024 register I32 hardcount = 0;
53c4c00c 4025 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4026
3280af22 4027 scan = PL_reginput;
faf11cac
HS
4028 if (max == REG_INFTY)
4029 max = I32_MAX;
4030 else if (max < loceol - scan)
a0d0e21e 4031 loceol = scan + max;
a0d0e21e 4032 switch (OP(p)) {
22c35a8c 4033 case REG_ANY:
1aa99e6b 4034 if (do_utf8) {
ffc61ed2 4035 loceol = PL_regeol;
1aa99e6b 4036 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4037 scan += UTF8SKIP(scan);
4038 hardcount++;
4039 }
4040 } else {
4041 while (scan < loceol && *scan != '\n')
4042 scan++;
a0ed51b3
LW
4043 }
4044 break;
ffc61ed2 4045 case SANY:
def8e4ea
JH
4046 if (do_utf8) {
4047 loceol = PL_regeol;
a0804c9e 4048 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4049 scan += UTF8SKIP(scan);
4050 hardcount++;
4051 }
4052 }
4053 else
4054 scan = loceol;
a0ed51b3 4055 break;
f33976b4
DB
4056 case CANY:
4057 scan = loceol;
4058 break;
bbce6d69 4059 case EXACT: /* length of string is 1 */
cd439c50 4060 c = (U8)*STRING(p);
bbce6d69 4061 while (scan < loceol && UCHARAT(scan) == c)
4062 scan++;
4063 break;
4064 case EXACTF: /* length of string is 1 */
cd439c50 4065 c = (U8)*STRING(p);
bbce6d69 4066 while (scan < loceol &&
22c35a8c 4067 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4068 scan++;
4069 break;
4070 case EXACTFL: /* length of string is 1 */
3280af22 4071 PL_reg_flags |= RF_tainted;
cd439c50 4072 c = (U8)*STRING(p);
bbce6d69 4073 while (scan < loceol &&
22c35a8c 4074 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4075 scan++;
4076 break;
4077 case ANYOF:
ffc61ed2
JH
4078 if (do_utf8) {
4079 loceol = PL_regeol;
cfc92286
JH
4080 while (hardcount < max && scan < loceol &&
4081 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4082 scan += UTF8SKIP(scan);
4083 hardcount++;
4084 }
4085 } else {
7d3e948e 4086 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4087 scan++;
4088 }
a0d0e21e
LW
4089 break;
4090 case ALNUM:
1aa99e6b 4091 if (do_utf8) {
ffc61ed2 4092 loceol = PL_regeol;
8269fa76 4093 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4094 while (hardcount < max && scan < loceol &&
3568d838 4095 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4096 scan += UTF8SKIP(scan);
4097 hardcount++;
4098 }
4099 } else {
4100 while (scan < loceol && isALNUM(*scan))
4101 scan++;
a0ed51b3
LW
4102 }
4103 break;
bbce6d69 4104 case ALNUML:
3280af22 4105 PL_reg_flags |= RF_tainted;
1aa99e6b 4106 if (do_utf8) {
ffc61ed2 4107 loceol = PL_regeol;
1aa99e6b
IH
4108 while (hardcount < max && scan < loceol &&
4109 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4110 scan += UTF8SKIP(scan);
4111 hardcount++;
4112 }
4113 } else {
4114 while (scan < loceol && isALNUM_LC(*scan))
4115 scan++;
a0ed51b3
LW
4116 }
4117 break;
a0d0e21e 4118 case NALNUM:
1aa99e6b 4119 if (do_utf8) {
ffc61ed2 4120 loceol = PL_regeol;
8269fa76 4121 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4122 while (hardcount < max && scan < loceol &&
3568d838 4123 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4124 scan += UTF8SKIP(scan);
4125 hardcount++;
4126 }
4127 } else {
4128 while (scan < loceol && !isALNUM(*scan))
4129 scan++;
a0ed51b3
LW
4130 }
4131 break;
bbce6d69 4132 case NALNUML:
3280af22 4133 PL_reg_flags |= RF_tainted;
1aa99e6b 4134 if (do_utf8) {
ffc61ed2 4135 loceol = PL_regeol;
1aa99e6b
IH
4136 while (hardcount < max && scan < loceol &&
4137 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4138 scan += UTF8SKIP(scan);
4139 hardcount++;
4140 }
4141 } else {
4142 while (scan < loceol && !isALNUM_LC(*scan))
4143 scan++;
a0ed51b3
LW
4144 }
4145 break;
a0d0e21e 4146 case SPACE:
1aa99e6b 4147 if (do_utf8) {
ffc61ed2 4148 loceol = PL_regeol;
8269fa76 4149 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4150 while (hardcount < max && scan < loceol &&
3568d838
JH
4151 (*scan == ' ' ||
4152 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4153 scan += UTF8SKIP(scan);
4154 hardcount++;
4155 }
4156 } else {
4157 while (scan < loceol && isSPACE(*scan))
4158 scan++;
a0ed51b3
LW
4159 }
4160 break;
bbce6d69 4161 case SPACEL:
3280af22 4162 PL_reg_flags |= RF_tainted;
1aa99e6b 4163 if (do_utf8) {
ffc61ed2 4164 loceol = PL_regeol;
1aa99e6b 4165 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4166 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4167 scan += UTF8SKIP(scan);
4168 hardcount++;
4169 }
4170 } else {
4171 while (scan < loceol && isSPACE_LC(*scan))
4172 scan++;
a0ed51b3
LW
4173 }
4174 break;
a0d0e21e 4175 case NSPACE:
1aa99e6b 4176 if (do_utf8) {
ffc61ed2 4177 loceol = PL_regeol;
8269fa76 4178 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4179 while (hardcount < max && scan < loceol &&
3568d838
JH
4180 !(*scan == ' ' ||
4181 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4182 scan += UTF8SKIP(scan);
4183 hardcount++;
4184 }
4185 } else {
4186 while (scan < loceol && !isSPACE(*scan))
4187 scan++;
4188 break;
a0ed51b3 4189 }
bbce6d69 4190 case NSPACEL:
3280af22 4191 PL_reg_flags |= RF_tainted;
1aa99e6b 4192 if (do_utf8) {
ffc61ed2 4193 loceol = PL_regeol;
1aa99e6b 4194 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4195 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4196 scan += UTF8SKIP(scan);
4197 hardcount++;
4198 }
4199 } else {
4200 while (scan < loceol && !isSPACE_LC(*scan))
4201 scan++;
a0ed51b3
LW
4202 }
4203 break;
a0d0e21e 4204 case DIGIT:
1aa99e6b 4205 if (do_utf8) {
ffc61ed2 4206 loceol = PL_regeol;
8269fa76 4207 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4208 while (hardcount < max && scan < loceol &&
3568d838 4209 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4210 scan += UTF8SKIP(scan);
4211 hardcount++;
4212 }
4213 } else {
4214 while (scan < loceol && isDIGIT(*scan))
4215 scan++;
a0ed51b3
LW
4216 }
4217 break;
a0d0e21e 4218 case NDIGIT:
1aa99e6b 4219 if (do_utf8) {
ffc61ed2 4220 loceol = PL_regeol;
8269fa76 4221 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4222 while (hardcount < max && scan < loceol &&
3568d838 4223 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4224 scan += UTF8SKIP(scan);
4225 hardcount++;
4226 }
4227 } else {
4228 while (scan < loceol && !isDIGIT(*scan))
4229 scan++;
a0ed51b3
LW
4230 }
4231 break;
a0d0e21e
LW
4232 default: /* Called on something of 0 width. */
4233 break; /* So match right here or not at all. */
4234 }
a687059c 4235
a0ed51b3
LW
4236 if (hardcount)
4237 c = hardcount;
4238 else
4239 c = scan - PL_reginput;
3280af22 4240 PL_reginput = scan;
a687059c 4241
9041c2e3 4242 DEBUG_r(
c277df42
IZ
4243 {
4244 SV *prop = sv_newmortal();
4245
4246 regprop(prop, p);
9041c2e3
NIS
4247 PerlIO_printf(Perl_debug_log,
4248 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4249 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4250 });
9041c2e3 4251
a0d0e21e 4252 return(c);
a687059c
LW
4253}
4254
4255/*
c277df42 4256 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4257 *
c277df42
IZ
4258 * The repeater is supposed to have constant length.
4259 */
4260
76e3520e 4261STATIC I32
cea2e8a9 4262S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4263{
b7953727 4264 register char *scan = Nullch;
c277df42 4265 register char *start;
3280af22 4266 register char *loceol = PL_regeol;
a0ed51b3 4267 I32 l = 0;
708e3b05 4268 I32 count = 0, res = 1;
a0ed51b3
LW
4269
4270 if (!max)
4271 return 0;
c277df42 4272
3280af22 4273 start = PL_reginput;
53c4c00c 4274 if (PL_reg_match_utf8) {
708e3b05 4275 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4276 if (!count++) {
4277 l = 0;
4278 while (start < PL_reginput) {
4279 l++;
4280 start += UTF8SKIP(start);
4281 }
4282 *lp = l;
4283 if (l == 0)
4284 return max;
4285 }
4286 if (count == max)
4287 return count;
4288 }
4289 }
4290 else {
708e3b05 4291 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4292 if (!count++) {
4293 *lp = l = PL_reginput - start;
4294 if (max != REG_INFTY && l*max < loceol - scan)
4295 loceol = scan + l*max;
4296 if (l == 0)
4297 return max;
c277df42
IZ
4298 }
4299 }
4300 }
708e3b05 4301 if (!res)
3280af22 4302 PL_reginput = scan;
9041c2e3 4303
a0ed51b3 4304 return count;
c277df42
IZ
4305}
4306
4307/*
ffc61ed2
JH
4308- regclass_swash - prepare the utf8 swash
4309*/
4310
4311SV *
9e55ce06 4312Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4313{
9e55ce06
JH
4314 SV *sw = NULL;
4315 SV *si = NULL;
4316 SV *alt = NULL;
ffc61ed2
JH
4317
4318 if (PL_regdata && PL_regdata->count) {
4319 U32 n = ARG(node);
4320
4321 if (PL_regdata->what[n] == 's') {
4322 SV *rv = (SV*)PL_regdata->data[n];
4323 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4324 SV **ary = AvARRAY(av);
9e55ce06 4325 SV **a, **b;
9041c2e3 4326
9e55ce06
JH
4327 /* See the end of regcomp.c:S_reglass() for
4328 * documentation of these array elements. */
4329
b11f357e
JH
4330 si = *ary;
4331 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4332 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4333
ffc61ed2
JH
4334 if (a)
4335 sw = *a;
4336 else if (si && doinit) {
4337 sw = swash_init("utf8", "", si, 1, 0);
4338 (void)av_store(av, 1, sw);
4339 }
9e55ce06
JH
4340 if (b)
4341 alt = *b;
ffc61ed2
JH
4342 }
4343 }
4344
9e55ce06
JH
4345 if (listsvp)
4346 *listsvp = si;
4347 if (altsvp)
4348 *altsvp = alt;
ffc61ed2
JH
4349
4350 return sw;
4351}
4352
4353/*
ba7b4546 4354 - reginclass - determine if a character falls into a character class
832705d4
JH
4355
4356 The n is the ANYOF regnode, the p is the target string, lenp
4357 is pointer to the maximum length of how far to go in the p
4358 (if the lenp is zero, UTF8SKIP(p) is used),
4359 do_utf8 tells whether the target string is in UTF-8.
4360
bbce6d69 4361 */
4362
76e3520e 4363STATIC bool
ba7b4546 4364S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4365{
ffc61ed2 4366 char flags = ANYOF_FLAGS(n);
bbce6d69 4367 bool match = FALSE;
cc07378b 4368 UV c = *p;
ae9ddab8 4369 STRLEN len = 0;
9e55ce06 4370 STRLEN plen;
1aa99e6b 4371
ae9ddab8
JH
4372 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4373 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4374 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4375
0f0076b4 4376 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4377 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4378 if (lenp)
4379 *lenp = 0;
ffc61ed2 4380 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4381 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4382 match = TRUE;
bbce6d69 4383 }
3568d838 4384 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4385 match = TRUE;
ffc61ed2 4386 if (!match) {
9e55ce06
JH
4387 AV *av;
4388 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4389
4390 if (sw) {
3568d838 4391 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4392 match = TRUE;
4393 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4394 if (!match && lenp && av) {
4395 I32 i;
4396
4397 for (i = 0; i <= av_len(av); i++) {
4398 SV* sv = *av_fetch(av, i, FALSE);
4399 STRLEN len;
4400 char *s = SvPV(sv, len);
4401
061b10df 4402 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4403 *lenp = len;
4404 match = TRUE;
4405 break;
4406 }
4407 }
4408 }
4409 if (!match) {
4a623e43
JH
4410 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4411 STRLEN tmplen;
4412
9e55ce06
JH
4413 to_utf8_fold(p, tmpbuf, &tmplen);
4414 if (swash_fetch(sw, tmpbuf, do_utf8))
4415 match = TRUE;
4416 }
ffc61ed2
JH
4417 }
4418 }
bbce6d69 4419 }
9e55ce06 4420 if (match && lenp && *lenp == 0)
0f0076b4 4421 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4422 }
1aa99e6b 4423 if (!match && c < 256) {
ffc61ed2
JH
4424 if (ANYOF_BITMAP_TEST(n, c))
4425 match = TRUE;
4426 else if (flags & ANYOF_FOLD) {
eb160463 4427 U8 f;
a0ed51b3 4428
ffc61ed2
JH
4429 if (flags & ANYOF_LOCALE) {
4430 PL_reg_flags |= RF_tainted;
4431 f = PL_fold_locale[c];
4432 }
4433 else
4434 f = PL_fold[c];
4435 if (f != c && ANYOF_BITMAP_TEST(n, f))
4436 match = TRUE;
4437 }
4438
4439 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4440 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4441 if (
4442 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4443 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4444 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4445 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4446 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4447 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4448 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4449 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4450 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4451 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4452 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4453 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4454 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4455 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4456 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4457 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4458 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4459 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4460 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4461 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4462 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4463 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4472 ) /* How's that for a conditional? */
4473 {
4474 match = TRUE;
4475 }
a0ed51b3 4476 }
a0ed51b3
LW
4477 }
4478
a0ed51b3
LW
4479 return (flags & ANYOF_INVERT) ? !match : match;
4480}
161b471a 4481
dfe13c55 4482STATIC U8 *
cea2e8a9 4483S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4484{
1aa99e6b
IH
4485 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4486}
4487
4488STATIC U8 *
4489S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4490{
a0ed51b3 4491 if (off >= 0) {
1aa99e6b 4492 while (off-- && s < lim) {
ffc61ed2 4493 /* XXX could check well-formedness here */
a0ed51b3 4494 s += UTF8SKIP(s);
ffc61ed2 4495 }
a0ed51b3
LW
4496 }
4497 else {
4498 while (off++) {
1aa99e6b 4499 if (s > lim) {
a0ed51b3 4500 s--;
ffc61ed2 4501 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4502 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4503 s--;
ffc61ed2
JH
4504 }
4505 /* XXX could check well-formedness here */
a0ed51b3
LW
4506 }
4507 }
4508 }
4509 return s;
4510}
161b471a 4511
dfe13c55 4512STATIC U8 *
1aa99e6b 4513S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4514{
1aa99e6b
IH
4515 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4516}
4517
4518STATIC U8 *
4519S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4520{
4521 if (off >= 0) {
1aa99e6b 4522 while (off-- && s < lim) {
ffc61ed2 4523 /* XXX could check well-formedness here */
a0ed51b3 4524 s += UTF8SKIP(s);
ffc61ed2 4525 }
a0ed51b3
LW
4526 if (off >= 0)
4527 return 0;
4528 }
4529 else {
4530 while (off++) {
1aa99e6b 4531 if (s > lim) {
a0ed51b3 4532 s--;
ffc61ed2 4533 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4534 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4535 s--;
ffc61ed2
JH
4536 }
4537 /* XXX could check well-formedness here */
a0ed51b3
LW
4538 }
4539 else
4540 break;
4541 }
4542 if (off <= 0)
4543 return 0;
4544 }
4545 return s;
4546}
51371543 4547
51371543 4548static void
acfe0abc 4549restore_pos(pTHX_ void *arg)
51371543 4550{
51371543
GS
4551 if (PL_reg_eval_set) {
4552 if (PL_reg_oldsaved) {
4553 PL_reg_re->subbeg = PL_reg_oldsaved;
4554 PL_reg_re->sublen = PL_reg_oldsavedlen;
ed252734
NC
4555#ifdef PERL_COPY_ON_WRITE
4556 PL_reg_re->saved_copy = PL_nrs;
4557#endif
51371543
GS
4558 RX_MATCH_COPIED_on(PL_reg_re);
4559 }
4560 PL_reg_magic->mg_len = PL_reg_oldpos;
4561 PL_reg_eval_set = 0;
4562 PL_curpm = PL_reg_oldcurpm;
4563 }
4564}
33b8afdf
JH
4565
4566STATIC void
4567S_to_utf8_substr(pTHX_ register regexp *prog)
4568{
4569 SV* sv;
4570 if (prog->float_substr && !prog->float_utf8) {
4571 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4572 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4573 sv_utf8_upgrade(sv);
4574 if (SvTAIL(prog->float_substr))
4575 SvTAIL_on(sv);
4576 if (prog->float_substr == prog->check_substr)
4577 prog->check_utf8 = sv;
4578 }
4579 if (prog->anchored_substr && !prog->anchored_utf8) {
4580 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4581 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4582 sv_utf8_upgrade(sv);
4583 if (SvTAIL(prog->anchored_substr))
4584 SvTAIL_on(sv);
4585 if (prog->anchored_substr == prog->check_substr)
4586 prog->check_utf8 = sv;
4587 }
4588}
4589
4590STATIC void
4591S_to_byte_substr(pTHX_ register regexp *prog)
4592{
4593 SV* sv;
4594 if (prog->float_utf8 && !prog->float_substr) {
4595 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4596 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4597 if (sv_utf8_downgrade(sv, TRUE)) {
4598 if (SvTAIL(prog->float_utf8))
4599 SvTAIL_on(sv);
4600 } else {
4601 SvREFCNT_dec(sv);
4602 prog->float_substr = sv = &PL_sv_undef;
4603 }
4604 if (prog->float_utf8 == prog->check_utf8)
4605 prog->check_substr = sv;
4606 }
4607 if (prog->anchored_utf8 && !prog->anchored_substr) {
4608 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4609 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4610 if (sv_utf8_downgrade(sv, TRUE)) {
4611 if (SvTAIL(prog->anchored_utf8))
4612 SvTAIL_on(sv);
4613 } else {
4614 SvREFCNT_dec(sv);
4615 prog->anchored_substr = sv = &PL_sv_undef;
4616 }
4617 if (prog->anchored_utf8 == prog->check_utf8)
4618 prog->check_substr = sv;
4619 }
4620}