This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
for pod/perlfaq2.pod against latest snapshot for Alpaca
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
4bb101f2
JH
70 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
71 **** 2000, 2001, 2002, 2003, by Larry Wall and others
a687059c 72 ****
9ef589d8
LW
73 **** You may distribute under the terms of either the GNU General Public
74 **** License or the Artistic License, as specified in the README file.
a687059c
LW
75 *
76 * Beware that some of this code is subtly aware of the way operator
77 * precedence is structured in regular expressions. Serious changes in
78 * regular-expression syntax might require a total rethink.
79 */
80#include "EXTERN.h"
864dbfa3 81#define PERL_IN_REGEXEC_C
a687059c 82#include "perl.h"
0f5d15d6 83
a687059c
LW
84#include "regcomp.h"
85
c277df42
IZ
86#define RF_tainted 1 /* tainted information used? */
87#define RF_warned 2 /* warned about big count? */
ce862d02 88#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
89#define RF_utf8 8 /* String contains multibyte chars? */
90
eb160463 91#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
92
93#define RS_init 1 /* eval environment created */
94#define RS_set 2 /* replsv value is set */
c277df42 95
a687059c
LW
96#ifndef STATIC
97#define STATIC static
98#endif
99
ba7b4546 100#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 101
c277df42
IZ
102/*
103 * Forwards.
104 */
105
33b8afdf 106#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 107#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 108
dfe13c55
GS
109#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
110#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
111#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
112#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
113#define HOPc(pos,off) ((char*)HOP(pos,off))
114#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 115
efb30f32 116#define HOPBACK(pos, off) ( \
e54858b0 117 (PL_reg_match_utf8) \
efb30f32
HS
118 ? reghopmaybe((U8*)pos, -off) \
119 : (pos - off >= PL_bostr) \
120 ? (U8*)(pos - off) \
121 : (U8*)NULL \
122)
123#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
124
1aa99e6b
IH
125#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
126#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
127#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
128#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
129#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
130#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
131
977e4b84 132#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 133
5f80c4cf 134/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
135#define JUMPABLE(rn) ( \
136 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
137 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
138 OP(rn) == PLUS || OP(rn) == MINMOD || \
139 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
140)
141
cca55fe3
JP
142#define HAS_TEXT(rn) ( \
143 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
144)
e2d8ce26 145
a84d97b6
HS
146/*
147 Search for mandatory following text node; for lookahead, the text must
148 follow but for lookbehind (rn->flags != 0) we skip to the next step.
149*/
cca55fe3 150#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 151 while (JUMPABLE(rn)) \
a84d97b6 152 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 153 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
154 else if (OP(rn) == PLUS) \
155 rn = NEXTOPER(rn); \
a84d97b6
HS
156 else if (OP(rn) == IFMATCH) \
157 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 158 else rn += NEXT_OFF(rn); \
5f80c4cf 159} STMT_END
74750237 160
acfe0abc 161static void restore_pos(pTHX_ void *arg);
51371543 162
76e3520e 163STATIC CHECKPOINT
cea2e8a9 164S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 165{
3280af22 166 int retval = PL_savestack_ix;
b1ce53c5
JH
167#define REGCP_PAREN_ELEMS 4
168 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
169 int p;
170
e49a9654
IH
171 if (paren_elems_to_push < 0)
172 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
173
a01268b5 174#define REGCP_OTHER_ELEMS 6
4b3c1a47 175 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 176 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 177/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
178 SSPUSHINT(PL_regendp[p]);
179 SSPUSHINT(PL_regstartp[p]);
3280af22 180 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
181 SSPUSHINT(p);
182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
02db2b7b 198# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 199 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 200 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 201
02db2b7b 202# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 203 PerlIO_printf(Perl_debug_log, \
faccc32b 204 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 205 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 206
76e3520e 207STATIC char *
cea2e8a9 208S_regcppop(pTHX)
a0d0e21e 209{
b1ce53c5 210 I32 i;
a0d0e21e
LW
211 U32 paren = 0;
212 char *input;
cf93c79d 213 I32 tmps;
b1ce53c5
JH
214
215 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 216 i = SSPOPINT;
b1ce53c5
JH
217 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
218 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 219 input = (char *) SSPOPPTR;
a01268b5 220 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
221 *PL_reglastparen = SSPOPINT;
222 PL_regsize = SSPOPINT;
b1ce53c5
JH
223
224 /* Now restore the parentheses context. */
41123dfd
JH
225 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
226 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 227 paren = (U32)SSPOPINT;
3280af22 228 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
229 PL_regstartp[paren] = SSPOPINT;
230 tmps = SSPOPINT;
3280af22
NIS
231 if (paren <= *PL_reglastparen)
232 PL_regendp[paren] = tmps;
c277df42 233 DEBUG_r(
c3464db5 234 PerlIO_printf(Perl_debug_log,
b900a521 235 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 236 (UV)paren, (IV)PL_regstartp[paren],
b900a521 237 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 238 (IV)PL_regendp[paren],
3280af22 239 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 240 );
a0d0e21e 241 }
c277df42 242 DEBUG_r(
eb160463 243 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 244 PerlIO_printf(Perl_debug_log,
faccc32b
JH
245 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
246 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
247 }
248 );
daf18116 249#if 1
dafc8851
JH
250 /* It would seem that the similar code in regtry()
251 * already takes care of this, and in fact it is in
252 * a better location to since this code can #if 0-ed out
253 * but the code in regtry() is needed or otherwise tests
254 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
255 * (as of patchlevel 7877) will fail. Then again,
256 * this code seems to be necessary or otherwise
257 * building DynaLoader will fail:
258 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
259 * --jhi */
eb160463
GS
260 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
261 if ((I32)paren > PL_regsize)
cf93c79d
IZ
262 PL_regstartp[paren] = -1;
263 PL_regendp[paren] = -1;
a0d0e21e 264 }
dafc8851 265#endif
a0d0e21e
LW
266 return input;
267}
268
0f5d15d6 269STATIC char *
cea2e8a9 270S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
271{
272 I32 tmp = PL_savestack_ix;
273
274 PL_savestack_ix = ss;
275 regcppop();
276 PL_savestack_ix = tmp;
942e002e 277 return Nullch;
0f5d15d6
IZ
278}
279
280typedef struct re_cc_state
281{
282 I32 ss;
283 regnode *node;
284 struct re_cc_state *prev;
285 CURCUR *cc;
286 regexp *re;
287} re_cc_state;
288
02db2b7b 289#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 290
29d1e993
HS
291#define TRYPAREN(paren, n, input) { \
292 if (paren) { \
293 if (n) { \
294 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
295 PL_regendp[paren] = input - PL_bostr; \
296 } \
297 else \
298 PL_regendp[paren] = -1; \
299 } \
300 if (regmatch(next)) \
301 sayYES; \
302 if (paren && n) \
303 PL_regendp[paren] = -1; \
304}
305
306
a687059c 307/*
e50aee73 308 * pregexec and friends
a687059c
LW
309 */
310
311/*
c277df42 312 - pregexec - match a regexp against a string
a687059c 313 */
c277df42 314I32
864dbfa3 315Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 316 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
317/* strend: pointer to null at end of string */
318/* strbeg: real beginning of string */
319/* minend: end of match must be >=minend after stringarg. */
320/* nosave: For optimizations. */
321{
322 return
9041c2e3 323 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
324 nosave ? 0 : REXEC_COPY_STR);
325}
0f5d15d6
IZ
326
327STATIC void
cea2e8a9 328S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
329{
330 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
331#ifdef DEBUGGING
332 PL_regprogram = prog->program;
333#endif
334 PL_regnpar = prog->nparens;
9041c2e3
NIS
335 PL_regdata = prog->data;
336 PL_reg_re = prog;
0f5d15d6 337}
22e551b9 338
9041c2e3 339/*
cad2e5aa
JH
340 * Need to implement the following flags for reg_anch:
341 *
342 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
343 * USE_INTUIT_ML
344 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
345 * INTUIT_AUTORITATIVE_ML
346 * INTUIT_ONCE_NOML - Intuit can match in one location only.
347 * INTUIT_ONCE_ML
348 *
349 * Another flag for this function: SECOND_TIME (so that float substrs
350 * with giant delta may be not rechecked).
351 */
352
353/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
354
2c2d71f5 355/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
356 Otherwise, only SvCUR(sv) is used to get strbeg. */
357
358/* XXXX We assume that strpos is strbeg unless sv. */
359
6eb5f6b9
JH
360/* XXXX Some places assume that there is a fixed substring.
361 An update may be needed if optimizer marks as "INTUITable"
362 RExen without fixed substrings. Similarly, it is assumed that
363 lengths of all the strings are no more than minlen, thus they
364 cannot come from lookahead.
365 (Or minlen should take into account lookahead.) */
366
2c2d71f5
JH
367/* A failure to find a constant substring means that there is no need to make
368 an expensive call to REx engine, thus we celebrate a failure. Similarly,
369 finding a substring too deep into the string means that less calls to
30944b6d
IZ
370 regtry() should be needed.
371
372 REx compiler's optimizer found 4 possible hints:
373 a) Anchored substring;
374 b) Fixed substring;
375 c) Whether we are anchored (beginning-of-line or \G);
376 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 377 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
378 string which does not contradict any of them.
379 */
2c2d71f5 380
6eb5f6b9
JH
381/* Most of decisions we do here should have been done at compile time.
382 The nodes of the REx which we used for the search should have been
383 deleted from the finite automaton. */
384
cad2e5aa
JH
385char *
386Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
387 char *strend, U32 flags, re_scream_pos_data *data)
388{
b7953727 389 register I32 start_shift = 0;
cad2e5aa 390 /* Should be nonnegative! */
b7953727 391 register I32 end_shift = 0;
2c2d71f5
JH
392 register char *s;
393 register SV *check;
a1933d95 394 char *strbeg;
cad2e5aa 395 char *t;
33b8afdf 396 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 397 I32 ml_anch;
6eb5f6b9 398 register char *other_last = Nullch; /* other substr checked before this */
b7953727 399 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
400#ifdef DEBUGGING
401 char *i_strpos = strpos;
ce333219 402 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 403#endif
a30b2f1f 404 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 405
b8d68ded
JH
406 if (prog->reganch & ROPT_UTF8) {
407 DEBUG_r(PerlIO_printf(Perl_debug_log,
408 "UTF-8 regex...\n"));
409 PL_reg_flags |= RF_utf8;
410 }
411
2a782b5b 412 DEBUG_r({
b8d68ded 413 char *s = PL_reg_match_utf8 ?
c728cb41
JH
414 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
415 strpos;
b8d68ded
JH
416 int len = PL_reg_match_utf8 ?
417 strlen(s) : strend - strpos;
2a782b5b
JH
418 if (!PL_colorset)
419 reginitcolors();
b8d68ded
JH
420 if (PL_reg_match_utf8)
421 DEBUG_r(PerlIO_printf(Perl_debug_log,
422 "UTF-8 target...\n"));
2a782b5b
JH
423 PerlIO_printf(Perl_debug_log,
424 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
425 PL_colors[4],PL_colors[5],PL_colors[0],
426 prog->precomp,
427 PL_colors[1],
428 (strlen(prog->precomp) > 60 ? "..." : ""),
429 PL_colors[0],
430 (int)(len > 60 ? 60 : len),
431 s, PL_colors[1],
432 (len > 60 ? "..." : "")
433 );
434 });
cad2e5aa 435
c344f387
JH
436 /* CHR_DIST() would be more correct here but it makes things slow. */
437 if (prog->minlen > strend - strpos) {
a72c7584
JH
438 DEBUG_r(PerlIO_printf(Perl_debug_log,
439 "String too short... [re_intuit_start]\n"));
cad2e5aa 440 goto fail;
2c2d71f5 441 }
a1933d95 442 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 443 PL_regeol = strend;
33b8afdf
JH
444 if (do_utf8) {
445 if (!prog->check_utf8 && prog->check_substr)
446 to_utf8_substr(prog);
447 check = prog->check_utf8;
448 } else {
449 if (!prog->check_substr && prog->check_utf8)
450 to_byte_substr(prog);
451 check = prog->check_substr;
452 }
453 if (check == &PL_sv_undef) {
454 DEBUG_r(PerlIO_printf(Perl_debug_log,
455 "Non-utf string cannot match utf check string\n"));
456 goto fail;
457 }
2c2d71f5 458 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
459 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
460 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 461 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 462
7e25d62c
JH
463 if (!ml_anch) {
464 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
465 | ROPT_IMPLICIT)) /* not a real BOL */
466 /* SvCUR is not set on references: SvRV and SvPVX overlap */
467 && sv && !SvROK(sv)
468 && (strpos != strbeg)) {
469 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
470 goto fail;
471 }
472 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 473 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 474 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
475 I32 slen;
476
1aa99e6b 477 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
478 if (SvTAIL(check)) {
479 slen = SvCUR(check); /* >= 1 */
cad2e5aa 480
9041c2e3 481 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
482 || (strend - s == slen && strend[-1] != '\n')) {
483 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
484 goto fail_finish;
cad2e5aa
JH
485 }
486 /* Now should match s[0..slen-2] */
487 slen--;
653099ff 488 if (slen && (*SvPVX(check) != *s
cad2e5aa 489 || (slen > 1
653099ff 490 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
491 report_neq:
492 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
493 goto fail_finish;
494 }
cad2e5aa 495 }
653099ff
GS
496 else if (*SvPVX(check) != *s
497 || ((slen = SvCUR(check)) > 1
498 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
499 goto report_neq;
500 goto success_at_start;
7e25d62c 501 }
cad2e5aa 502 }
2c2d71f5 503 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 504 s = strpos;
2c2d71f5 505 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 506 end_shift = prog->minlen - start_shift -
653099ff 507 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 508 if (!ml_anch) {
653099ff
GS
509 I32 end = prog->check_offset_max + CHR_SVLEN(check)
510 - (SvTAIL(check) != 0);
1aa99e6b 511 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
512
513 if (end_shift < eshift)
514 end_shift = eshift;
515 }
cad2e5aa 516 }
2c2d71f5 517 else { /* Can match at random position */
cad2e5aa
JH
518 ml_anch = 0;
519 s = strpos;
2c2d71f5
JH
520 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
521 /* Should be nonnegative! */
522 end_shift = prog->minlen - start_shift -
653099ff 523 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
524 }
525
2c2d71f5 526#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 527 if (end_shift < 0)
6bbae5e6 528 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
529#endif
530
2c2d71f5
JH
531 restart:
532 /* Find a possible match in the region s..strend by looking for
533 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 534 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
535 I32 p = -1; /* Internal iterator of scream. */
536 I32 *pp = data ? data->scream_pos : &p;
537
2c2d71f5
JH
538 if (PL_screamfirst[BmRARE(check)] >= 0
539 || ( BmRARE(check) == '\n'
540 && (BmPREVIOUS(check) == SvCUR(check) - 1)
541 && SvTAIL(check) ))
9041c2e3 542 s = screaminstr(sv, check,
2c2d71f5 543 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 544 else
2c2d71f5 545 goto fail_finish;
4addbd3b
HS
546 /* we may be pointing at the wrong string */
547 if (s && RX_MATCH_COPIED(prog))
7ef91622 548 s = strbeg + (s - SvPVX(sv));
cad2e5aa
JH
549 if (data)
550 *data->scream_olds = s;
551 }
f33976b4 552 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
553 s = fbm_instr((U8*)(s + start_shift),
554 (U8*)(strend - end_shift),
555 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 556 else
1aa99e6b
IH
557 s = fbm_instr(HOP3(s, start_shift, strend),
558 HOP3(strend, -end_shift, strbeg),
2c2d71f5 559 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
560
561 /* Update the count-of-usability, remove useless subpatterns,
562 unshift s. */
2c2d71f5
JH
563
564 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
565 (s ? "Found" : "Did not find"),
33b8afdf 566 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 567 PL_colors[0],
7b0972df
JH
568 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
569 SvPVX(check),
2c2d71f5
JH
570 PL_colors[1], (SvTAIL(check) ? "$" : ""),
571 (s ? " at offset " : "...\n") ) );
572
573 if (!s)
574 goto fail_finish;
575
6eb5f6b9
JH
576 check_at = s;
577
2c2d71f5 578 /* Finish the diagnostic message */
30944b6d 579 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
580
581 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
582 Start with the other substr.
583 XXXX no SCREAM optimization yet - and a very coarse implementation
584 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
585 *always* match. Probably should be marked during compile...
586 Probably it is right to do no SCREAM here...
587 */
588
33b8afdf 589 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 590 /* Take into account the "other" substring. */
2c2d71f5
JH
591 /* XXXX May be hopelessly wrong for UTF... */
592 if (!other_last)
6eb5f6b9 593 other_last = strpos;
33b8afdf 594 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
595 do_other_anchored:
596 {
1aa99e6b 597 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 598 char *s1 = s;
33b8afdf 599 SV* must;
2c2d71f5 600
2c2d71f5
JH
601 t = s - prog->check_offset_max;
602 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 603 && (!do_utf8
1aa99e6b 604 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 605 && t > strpos)))
30944b6d 606 /* EMPTY */;
2c2d71f5
JH
607 else
608 t = strpos;
1aa99e6b 609 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
610 if (t < other_last) /* These positions already checked */
611 t = other_last;
1aa99e6b 612 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
613 if (last < last1)
614 last1 = last;
615 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
616 /* On end-of-str: see comment below. */
33b8afdf
JH
617 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
618 if (must == &PL_sv_undef) {
619 s = (char*)NULL;
620 DEBUG_r(must = prog->anchored_utf8); /* for debug */
621 }
622 else
623 s = fbm_instr(
624 (unsigned char*)t,
625 HOP3(HOP3(last1, prog->anchored_offset, strend)
626 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
627 must,
628 PL_multiline ? FBMrf_MULTILINE : 0
629 );
1aa99e6b
IH
630 DEBUG_r(PerlIO_printf(Perl_debug_log,
631 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
632 (s ? "Found" : "Contradicts"),
633 PL_colors[0],
33b8afdf
JH
634 (int)(SvCUR(must)
635 - (SvTAIL(must)!=0)),
636 SvPVX(must),
637 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
638 if (!s) {
639 if (last1 >= last2) {
640 DEBUG_r(PerlIO_printf(Perl_debug_log,
641 ", giving up...\n"));
642 goto fail_finish;
643 }
644 DEBUG_r(PerlIO_printf(Perl_debug_log,
645 ", trying floating at offset %ld...\n",
1aa99e6b
IH
646 (long)(HOP3c(s1, 1, strend) - i_strpos)));
647 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
648 s = HOP3c(last, 1, strend);
2c2d71f5
JH
649 goto restart;
650 }
651 else {
652 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 653 (long)(s - i_strpos)));
1aa99e6b
IH
654 t = HOP3c(s, -prog->anchored_offset, strbeg);
655 other_last = HOP3c(s, 1, strend);
30944b6d 656 s = s1;
2c2d71f5
JH
657 if (t == strpos)
658 goto try_at_start;
2c2d71f5
JH
659 goto try_at_offset;
660 }
30944b6d 661 }
2c2d71f5
JH
662 }
663 else { /* Take into account the floating substring. */
33b8afdf
JH
664 char *last, *last1;
665 char *s1 = s;
666 SV* must;
667
668 t = HOP3c(s, -start_shift, strbeg);
669 last1 = last =
670 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
671 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
672 last = HOP3c(t, prog->float_max_offset, strend);
673 s = HOP3c(t, prog->float_min_offset, strend);
674 if (s < other_last)
675 s = other_last;
2c2d71f5 676 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
677 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
678 /* fbm_instr() takes into account exact value of end-of-str
679 if the check is SvTAIL(ed). Since false positives are OK,
680 and end-of-str is not later than strend we are OK. */
681 if (must == &PL_sv_undef) {
682 s = (char*)NULL;
683 DEBUG_r(must = prog->float_utf8); /* for debug message */
684 }
685 else
2c2d71f5 686 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
687 (unsigned char*)last + SvCUR(must)
688 - (SvTAIL(must)!=0),
689 must, PL_multiline ? FBMrf_MULTILINE : 0);
690 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
691 (s ? "Found" : "Contradicts"),
692 PL_colors[0],
693 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
694 SvPVX(must),
695 PL_colors[1], (SvTAIL(must) ? "$" : "")));
696 if (!s) {
697 if (last1 == last) {
2c2d71f5 698 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
699 ", giving up...\n"));
700 goto fail_finish;
2c2d71f5 701 }
33b8afdf
JH
702 DEBUG_r(PerlIO_printf(Perl_debug_log,
703 ", trying anchored starting at offset %ld...\n",
704 (long)(s1 + 1 - i_strpos)));
705 other_last = last;
706 s = HOP3c(t, 1, strend);
707 goto restart;
708 }
709 else {
710 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
711 (long)(s - i_strpos)));
712 other_last = s; /* Fix this later. --Hugo */
713 s = s1;
714 if (t == strpos)
715 goto try_at_start;
716 goto try_at_offset;
717 }
2c2d71f5 718 }
cad2e5aa 719 }
2c2d71f5
JH
720
721 t = s - prog->check_offset_max;
2c2d71f5 722 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 723 && (!do_utf8
1aa99e6b
IH
724 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
725 && t > strpos))) {
2c2d71f5
JH
726 /* Fixed substring is found far enough so that the match
727 cannot start at strpos. */
728 try_at_offset:
cad2e5aa 729 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
730 /* Eventually fbm_*() should handle this, but often
731 anchored_offset is not 0, so this check will not be wasted. */
732 /* XXXX In the code below we prefer to look for "^" even in
733 presence of anchored substrings. And we search even
734 beyond the found float position. These pessimizations
735 are historical artefacts only. */
736 find_anchor:
2c2d71f5 737 while (t < strend - prog->minlen) {
cad2e5aa 738 if (*t == '\n') {
4ee3650e 739 if (t < check_at - prog->check_offset_min) {
33b8afdf 740 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
741 /* Since we moved from the found position,
742 we definitely contradict the found anchored
30944b6d
IZ
743 substr. Due to the above check we do not
744 contradict "check" substr.
745 Thus we can arrive here only if check substr
746 is float. Redo checking for "other"=="fixed".
747 */
9041c2e3 748 strpos = t + 1;
30944b6d
IZ
749 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
750 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
751 goto do_other_anchored;
752 }
4ee3650e
GS
753 /* We don't contradict the found floating substring. */
754 /* XXXX Why not check for STCLASS? */
cad2e5aa 755 s = t + 1;
2c2d71f5 756 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 757 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
758 goto set_useful;
759 }
4ee3650e
GS
760 /* Position contradicts check-string */
761 /* XXXX probably better to look for check-string
762 than for "\n", so one should lower the limit for t? */
763 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 764 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 765 other_last = strpos = s = t + 1;
cad2e5aa
JH
766 goto restart;
767 }
768 t++;
769 }
2c2d71f5
JH
770 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
771 PL_colors[0],PL_colors[1]));
772 goto fail_finish;
cad2e5aa 773 }
f5952150
GS
774 else {
775 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
776 PL_colors[0],PL_colors[1]));
777 }
cad2e5aa
JH
778 s = t;
779 set_useful:
33b8afdf 780 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
781 }
782 else {
f5952150 783 /* The found string does not prohibit matching at strpos,
2c2d71f5 784 - no optimization of calling REx engine can be performed,
f5952150
GS
785 unless it was an MBOL and we are not after MBOL,
786 or a future STCLASS check will fail this. */
2c2d71f5
JH
787 try_at_start:
788 /* Even in this situation we may use MBOL flag if strpos is offset
789 wrt the start of the string. */
05b4157f 790 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 791 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
792 /* May be due to an implicit anchor of m{.*foo} */
793 && !(prog->reganch & ROPT_IMPLICIT))
794 {
cad2e5aa
JH
795 t = strpos;
796 goto find_anchor;
797 }
30944b6d 798 DEBUG_r( if (ml_anch)
f5952150
GS
799 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
800 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 801 );
2c2d71f5 802 success_at_start:
30944b6d 803 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
804 && (do_utf8 ? (
805 prog->check_utf8 /* Could be deleted already */
806 && --BmUSEFUL(prog->check_utf8) < 0
807 && (prog->check_utf8 == prog->float_utf8)
808 ) : (
809 prog->check_substr /* Could be deleted already */
810 && --BmUSEFUL(prog->check_substr) < 0
811 && (prog->check_substr == prog->float_substr)
812 )))
66e933ab 813 {
cad2e5aa 814 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 815 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
816 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
817 if (do_utf8 ? prog->check_substr : prog->check_utf8)
818 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
819 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
820 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 821 check = Nullsv; /* abort */
cad2e5aa 822 s = strpos;
3cf5c195
IZ
823 /* XXXX This is a remnant of the old implementation. It
824 looks wasteful, since now INTUIT can use many
6eb5f6b9 825 other heuristics. */
cad2e5aa
JH
826 prog->reganch &= ~RE_USE_INTUIT;
827 }
828 else
829 s = strpos;
830 }
831
6eb5f6b9
JH
832 /* Last resort... */
833 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
834 if (prog->regstclass) {
835 /* minlen == 0 is possible if regstclass is \b or \B,
836 and the fixed substr is ''$.
837 Since minlen is already taken into account, s+1 is before strend;
838 accidentally, minlen >= 1 guaranties no false positives at s + 1
839 even for \b or \B. But (minlen? 1 : 0) below assumes that
840 regstclass does not come from lookahead... */
841 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
842 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 843 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 844 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 845 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 846 : 1);
33b8afdf 847 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 848 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 849 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
850 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
851 cl_l, strend)
852 : strend);
a1933d95 853 char *startpos = strbeg;
6eb5f6b9
JH
854
855 t = s;
9926ca43 856 cache_re(prog);
f33976b4 857 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
858 if (!s) {
859#ifdef DEBUGGING
b7953727 860 char *what = 0;
6eb5f6b9
JH
861#endif
862 if (endpos == strend) {
863 DEBUG_r( PerlIO_printf(Perl_debug_log,
864 "Could not match STCLASS...\n") );
865 goto fail;
866 }
66e933ab
GS
867 DEBUG_r( PerlIO_printf(Perl_debug_log,
868 "This position contradicts STCLASS...\n") );
653099ff
GS
869 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
870 goto fail;
6eb5f6b9 871 /* Contradict one of substrings */
33b8afdf
JH
872 if (prog->anchored_substr || prog->anchored_utf8) {
873 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
874 DEBUG_r( what = "anchored" );
875 hop_and_restart:
1aa99e6b 876 s = HOP3c(t, 1, strend);
66e933ab
GS
877 if (s + start_shift + end_shift > strend) {
878 /* XXXX Should be taken into account earlier? */
879 DEBUG_r( PerlIO_printf(Perl_debug_log,
880 "Could not match STCLASS...\n") );
881 goto fail;
882 }
5e39e1e5
HS
883 if (!check)
884 goto giveup;
6eb5f6b9 885 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 886 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
887 what, (long)(s + start_shift - i_strpos)) );
888 goto restart;
889 }
66e933ab 890 /* Have both, check_string is floating */
6eb5f6b9
JH
891 if (t + start_shift >= check_at) /* Contradicts floating=check */
892 goto retry_floating_check;
893 /* Recheck anchored substring, but not floating... */
9041c2e3 894 s = check_at;
5e39e1e5
HS
895 if (!check)
896 goto giveup;
6eb5f6b9 897 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 898 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
899 (long)(other_last - i_strpos)) );
900 goto do_other_anchored;
901 }
60e71179
GS
902 /* Another way we could have checked stclass at the
903 current position only: */
904 if (ml_anch) {
905 s = t = t + 1;
5e39e1e5
HS
906 if (!check)
907 goto giveup;
60e71179 908 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
909 "Looking for /%s^%s/m starting at offset %ld...\n",
910 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 911 goto try_at_offset;
66e933ab 912 }
33b8afdf 913 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 914 goto fail;
6eb5f6b9
JH
915 /* Check is floating subtring. */
916 retry_floating_check:
917 t = check_at - start_shift;
918 DEBUG_r( what = "floating" );
919 goto hop_and_restart;
920 }
b7953727
JH
921 if (t != s) {
922 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 923 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
924 (long)(t - i_strpos), (long)(s - i_strpos))
925 );
926 }
927 else {
928 DEBUG_r(PerlIO_printf(Perl_debug_log,
929 "Does not contradict STCLASS...\n");
930 );
931 }
6eb5f6b9 932 }
5e39e1e5
HS
933 giveup:
934 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
935 PL_colors[4], (check ? "Guessed" : "Giving up"),
936 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 937 return s;
2c2d71f5
JH
938
939 fail_finish: /* Substring not found */
33b8afdf
JH
940 if (prog->check_substr || prog->check_utf8) /* could be removed already */
941 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 942 fail:
2c2d71f5 943 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
944 PL_colors[4],PL_colors[5]));
945 return Nullch;
946}
9661b544 947
6eb5f6b9 948/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
949STATIC char *
950S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 951{
6eb5f6b9
JH
952 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
953 char *m;
d8093b23
G
954 STRLEN ln;
955 unsigned int c1;
956 unsigned int c2;
6eb5f6b9
JH
957 char *e;
958 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 959 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 960
6eb5f6b9
JH
961 /* We know what class it must start with. */
962 switch (OP(c)) {
6eb5f6b9 963 case ANYOF:
388cc4de
HS
964 if (do_utf8) {
965 while (s < strend) {
966 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
967 !UTF8_IS_INVARIANT((U8)s[0]) ?
968 reginclass(c, (U8*)s, 0, do_utf8) :
969 REGINCLASS(c, (U8*)s)) {
970 if (tmp && (norun || regtry(prog, s)))
971 goto got_it;
972 else
973 tmp = doevery;
974 }
975 else
976 tmp = 1;
977 s += UTF8SKIP(s);
978 }
979 }
980 else {
981 while (s < strend) {
982 STRLEN skip = 1;
983
984 if (REGINCLASS(c, (U8*)s) ||
985 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
986 /* The assignment of 2 is intentional:
987 * for the folded sharp s, the skip is 2. */
988 (skip = SHARP_S_SKIP))) {
989 if (tmp && (norun || regtry(prog, s)))
990 goto got_it;
991 else
992 tmp = doevery;
993 }
994 else
995 tmp = 1;
996 s += skip;
997 }
a0d0e21e 998 }
6eb5f6b9 999 break;
f33976b4
DB
1000 case CANY:
1001 while (s < strend) {
1002 if (tmp && (norun || regtry(prog, s)))
1003 goto got_it;
1004 else
1005 tmp = doevery;
1006 s++;
1007 }
1008 break;
6eb5f6b9
JH
1009 case EXACTF:
1010 m = STRING(c);
1011 ln = STR_LEN(c);
1aa99e6b 1012 if (UTF) {
a2a2844f 1013 STRLEN ulen1, ulen2;
e7ae6809
JH
1014 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1015 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
1016
1017 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1018 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1019
872c91ae
JH
1020 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1021 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1022 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1023 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1024 }
1025 else {
1026 c1 = *(U8*)m;
1027 c2 = PL_fold[c1];
1028 }
6eb5f6b9
JH
1029 goto do_exactf;
1030 case EXACTFL:
1031 m = STRING(c);
1032 ln = STR_LEN(c);
d8093b23 1033 c1 = *(U8*)m;
6eb5f6b9
JH
1034 c2 = PL_fold_locale[c1];
1035 do_exactf:
6795a346 1036 e = HOP3c(strend, -(I32)ln, s);
b3c9acc1 1037
6eb5f6b9
JH
1038 if (norun && e < s)
1039 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1040
60a8b682
JH
1041 /* The idea in the EXACTF* cases is to first find the
1042 * first character of the EXACTF* node and then, if
1043 * necessary, case-insensitively compare the full
1044 * text of the node. The c1 and c2 are the first
1045 * characters (though in Unicode it gets a bit
1046 * more complicated because there are more cases
7f16dd3d
JH
1047 * than just upper and lower: one needs to use
1048 * the so-called folding case for case-insensitive
1049 * matching (called "loose matching" in Unicode).
1050 * ibcmp_utf8() will do just that. */
60a8b682 1051
1aa99e6b 1052 if (do_utf8) {
575cac57
JH
1053 UV c, f;
1054 U8 tmpbuf [UTF8_MAXLEN+1];
1055 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1056 STRLEN len, foldlen;
d7f013c8 1057
09091399 1058 if (c1 == c2) {
1aa99e6b 1059 while (s <= e) {
872c91ae
JH
1060 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1061 ckWARN(WARN_UTF8) ?
1062 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1063 if ( c == c1
1064 && (ln == len ||
66423254 1065 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1066 m, (char **)0, ln, (bool)UTF))
55da9344 1067 && (norun || regtry(prog, s)) )
1aa99e6b 1068 goto got_it;
80aecb99
JH
1069 else {
1070 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1071 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1072 if ( f != c
1073 && (f == c1 || f == c2)
1074 && (ln == foldlen ||
66423254
JH
1075 !ibcmp_utf8((char *) foldbuf,
1076 (char **)0, foldlen, do_utf8,
d07ddd77 1077 m,
eb160463 1078 (char **)0, ln, (bool)UTF))
80aecb99
JH
1079 && (norun || regtry(prog, s)) )
1080 goto got_it;
1081 }
1aa99e6b
IH
1082 s += len;
1083 }
09091399
JH
1084 }
1085 else {
1aa99e6b 1086 while (s <= e) {
872c91ae
JH
1087 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1088 ckWARN(WARN_UTF8) ?
1089 0 : UTF8_ALLOW_ANY);
80aecb99 1090
60a8b682 1091 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1092 * Note that not all the possible combinations
1093 * are handled here: some of them are handled
1094 * by the standard folding rules, and some of
1095 * them (the character class or ANYOF cases)
1096 * are handled during compiletime in
1097 * regexec.c:S_regclass(). */
880bd946
JH
1098 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1099 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1100 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1101
1102 if ( (c == c1 || c == c2)
1103 && (ln == len ||
66423254 1104 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1105 m, (char **)0, ln, (bool)UTF))
55da9344 1106 && (norun || regtry(prog, s)) )
1aa99e6b 1107 goto got_it;
80aecb99
JH
1108 else {
1109 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1110 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1111 if ( f != c
1112 && (f == c1 || f == c2)
1113 && (ln == foldlen ||
a6872d42 1114 !ibcmp_utf8((char *) foldbuf,
66423254 1115 (char **)0, foldlen, do_utf8,
d07ddd77 1116 m,
eb160463 1117 (char **)0, ln, (bool)UTF))
80aecb99
JH
1118 && (norun || regtry(prog, s)) )
1119 goto got_it;
1120 }
1aa99e6b
IH
1121 s += len;
1122 }
09091399 1123 }
1aa99e6b
IH
1124 }
1125 else {
1126 if (c1 == c2)
1127 while (s <= e) {
1128 if ( *(U8*)s == c1
1129 && (ln == 1 || !(OP(c) == EXACTF
1130 ? ibcmp(s, m, ln)
1131 : ibcmp_locale(s, m, ln)))
1132 && (norun || regtry(prog, s)) )
1133 goto got_it;
1134 s++;
1135 }
1136 else
1137 while (s <= e) {
1138 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1139 && (ln == 1 || !(OP(c) == EXACTF
1140 ? ibcmp(s, m, ln)
1141 : ibcmp_locale(s, m, ln)))
1142 && (norun || regtry(prog, s)) )
1143 goto got_it;
1144 s++;
1145 }
b3c9acc1
IZ
1146 }
1147 break;
bbce6d69 1148 case BOUNDL:
3280af22 1149 PL_reg_flags |= RF_tainted;
bbce6d69 1150 /* FALL THROUGH */
a0d0e21e 1151 case BOUND:
ffc61ed2 1152 if (do_utf8) {
12d33761 1153 if (s == PL_bostr)
ffc61ed2
JH
1154 tmp = '\n';
1155 else {
b4f7163a 1156 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1157
b4f7163a 1158 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1159 }
1160 tmp = ((OP(c) == BOUND ?
9041c2e3 1161 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1162 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1163 while (s < strend) {
1164 if (tmp == !(OP(c) == BOUND ?
3568d838 1165 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1166 isALNUM_LC_utf8((U8*)s)))
1167 {
1168 tmp = !tmp;
1169 if ((norun || regtry(prog, s)))
1170 goto got_it;
1171 }
1172 s += UTF8SKIP(s);
a687059c 1173 }
a0d0e21e 1174 }
667bb95a 1175 else {
12d33761 1176 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1177 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1178 while (s < strend) {
1179 if (tmp ==
1180 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1181 tmp = !tmp;
1182 if ((norun || regtry(prog, s)))
1183 goto got_it;
1184 }
1185 s++;
a0ed51b3 1186 }
a0ed51b3 1187 }
6eb5f6b9 1188 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1189 goto got_it;
1190 break;
bbce6d69 1191 case NBOUNDL:
3280af22 1192 PL_reg_flags |= RF_tainted;
bbce6d69 1193 /* FALL THROUGH */
a0d0e21e 1194 case NBOUND:
ffc61ed2 1195 if (do_utf8) {
12d33761 1196 if (s == PL_bostr)
ffc61ed2
JH
1197 tmp = '\n';
1198 else {
b4f7163a 1199 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1200
b4f7163a 1201 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1202 }
1203 tmp = ((OP(c) == NBOUND ?
9041c2e3 1204 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1205 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1206 while (s < strend) {
1207 if (tmp == !(OP(c) == NBOUND ?
3568d838 1208 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1209 isALNUM_LC_utf8((U8*)s)))
1210 tmp = !tmp;
1211 else if ((norun || regtry(prog, s)))
1212 goto got_it;
1213 s += UTF8SKIP(s);
1214 }
a0d0e21e 1215 }
667bb95a 1216 else {
12d33761 1217 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1218 tmp = ((OP(c) == NBOUND ?
1219 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1220 while (s < strend) {
1221 if (tmp ==
1222 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1223 tmp = !tmp;
1224 else if ((norun || regtry(prog, s)))
1225 goto got_it;
1226 s++;
1227 }
a0ed51b3 1228 }
6eb5f6b9 1229 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1230 goto got_it;
1231 break;
a0d0e21e 1232 case ALNUM:
ffc61ed2 1233 if (do_utf8) {
8269fa76 1234 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1235 while (s < strend) {
3568d838 1236 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1237 if (tmp && (norun || regtry(prog, s)))
1238 goto got_it;
1239 else
1240 tmp = doevery;
1241 }
bbce6d69 1242 else
ffc61ed2
JH
1243 tmp = 1;
1244 s += UTF8SKIP(s);
bbce6d69 1245 }
bbce6d69 1246 }
ffc61ed2
JH
1247 else {
1248 while (s < strend) {
1249 if (isALNUM(*s)) {
1250 if (tmp && (norun || regtry(prog, s)))
1251 goto got_it;
1252 else
1253 tmp = doevery;
1254 }
a0ed51b3 1255 else
ffc61ed2
JH
1256 tmp = 1;
1257 s++;
a0ed51b3 1258 }
a0ed51b3
LW
1259 }
1260 break;
bbce6d69 1261 case ALNUML:
3280af22 1262 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1263 if (do_utf8) {
1264 while (s < strend) {
1265 if (isALNUM_LC_utf8((U8*)s)) {
1266 if (tmp && (norun || regtry(prog, s)))
1267 goto got_it;
1268 else
1269 tmp = doevery;
1270 }
a687059c 1271 else
ffc61ed2
JH
1272 tmp = 1;
1273 s += UTF8SKIP(s);
a0d0e21e 1274 }
a0d0e21e 1275 }
ffc61ed2
JH
1276 else {
1277 while (s < strend) {
1278 if (isALNUM_LC(*s)) {
1279 if (tmp && (norun || regtry(prog, s)))
1280 goto got_it;
1281 else
1282 tmp = doevery;
1283 }
a0ed51b3 1284 else
ffc61ed2
JH
1285 tmp = 1;
1286 s++;
a0ed51b3 1287 }
a0ed51b3
LW
1288 }
1289 break;
a0d0e21e 1290 case NALNUM:
ffc61ed2 1291 if (do_utf8) {
8269fa76 1292 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1293 while (s < strend) {
3568d838 1294 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1295 if (tmp && (norun || regtry(prog, s)))
1296 goto got_it;
1297 else
1298 tmp = doevery;
1299 }
bbce6d69 1300 else
ffc61ed2
JH
1301 tmp = 1;
1302 s += UTF8SKIP(s);
bbce6d69 1303 }
bbce6d69 1304 }
ffc61ed2
JH
1305 else {
1306 while (s < strend) {
1307 if (!isALNUM(*s)) {
1308 if (tmp && (norun || regtry(prog, s)))
1309 goto got_it;
1310 else
1311 tmp = doevery;
1312 }
a0ed51b3 1313 else
ffc61ed2
JH
1314 tmp = 1;
1315 s++;
a0ed51b3 1316 }
a0ed51b3
LW
1317 }
1318 break;
bbce6d69 1319 case NALNUML:
3280af22 1320 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1321 if (do_utf8) {
1322 while (s < strend) {
1323 if (!isALNUM_LC_utf8((U8*)s)) {
1324 if (tmp && (norun || regtry(prog, s)))
1325 goto got_it;
1326 else
1327 tmp = doevery;
1328 }
a687059c 1329 else
ffc61ed2
JH
1330 tmp = 1;
1331 s += UTF8SKIP(s);
a687059c 1332 }
a0d0e21e 1333 }
ffc61ed2
JH
1334 else {
1335 while (s < strend) {
1336 if (!isALNUM_LC(*s)) {
1337 if (tmp && (norun || regtry(prog, s)))
1338 goto got_it;
1339 else
1340 tmp = doevery;
1341 }
a0ed51b3 1342 else
ffc61ed2
JH
1343 tmp = 1;
1344 s++;
a0ed51b3 1345 }
a0ed51b3
LW
1346 }
1347 break;
a0d0e21e 1348 case SPACE:
ffc61ed2 1349 if (do_utf8) {
8269fa76 1350 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1351 while (s < strend) {
3568d838 1352 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1353 if (tmp && (norun || regtry(prog, s)))
1354 goto got_it;
1355 else
1356 tmp = doevery;
1357 }
a0d0e21e 1358 else
ffc61ed2
JH
1359 tmp = 1;
1360 s += UTF8SKIP(s);
2304df62 1361 }
a0d0e21e 1362 }
ffc61ed2
JH
1363 else {
1364 while (s < strend) {
1365 if (isSPACE(*s)) {
1366 if (tmp && (norun || regtry(prog, s)))
1367 goto got_it;
1368 else
1369 tmp = doevery;
1370 }
a0ed51b3 1371 else
ffc61ed2
JH
1372 tmp = 1;
1373 s++;
a0ed51b3 1374 }
a0ed51b3
LW
1375 }
1376 break;
bbce6d69 1377 case SPACEL:
3280af22 1378 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1379 if (do_utf8) {
1380 while (s < strend) {
1381 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1382 if (tmp && (norun || regtry(prog, s)))
1383 goto got_it;
1384 else
1385 tmp = doevery;
1386 }
bbce6d69 1387 else
ffc61ed2
JH
1388 tmp = 1;
1389 s += UTF8SKIP(s);
bbce6d69 1390 }
bbce6d69 1391 }
ffc61ed2
JH
1392 else {
1393 while (s < strend) {
1394 if (isSPACE_LC(*s)) {
1395 if (tmp && (norun || regtry(prog, s)))
1396 goto got_it;
1397 else
1398 tmp = doevery;
1399 }
a0ed51b3 1400 else
ffc61ed2
JH
1401 tmp = 1;
1402 s++;
a0ed51b3 1403 }
a0ed51b3
LW
1404 }
1405 break;
a0d0e21e 1406 case NSPACE:
ffc61ed2 1407 if (do_utf8) {
8269fa76 1408 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1409 while (s < strend) {
3568d838 1410 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1411 if (tmp && (norun || regtry(prog, s)))
1412 goto got_it;
1413 else
1414 tmp = doevery;
1415 }
a0d0e21e 1416 else
ffc61ed2
JH
1417 tmp = 1;
1418 s += UTF8SKIP(s);
a687059c 1419 }
a0d0e21e 1420 }
ffc61ed2
JH
1421 else {
1422 while (s < strend) {
1423 if (!isSPACE(*s)) {
1424 if (tmp && (norun || regtry(prog, s)))
1425 goto got_it;
1426 else
1427 tmp = doevery;
1428 }
a0ed51b3 1429 else
ffc61ed2
JH
1430 tmp = 1;
1431 s++;
a0ed51b3 1432 }
a0ed51b3
LW
1433 }
1434 break;
bbce6d69 1435 case NSPACEL:
3280af22 1436 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1437 if (do_utf8) {
1438 while (s < strend) {
1439 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1440 if (tmp && (norun || regtry(prog, s)))
1441 goto got_it;
1442 else
1443 tmp = doevery;
1444 }
bbce6d69 1445 else
ffc61ed2
JH
1446 tmp = 1;
1447 s += UTF8SKIP(s);
bbce6d69 1448 }
bbce6d69 1449 }
ffc61ed2
JH
1450 else {
1451 while (s < strend) {
1452 if (!isSPACE_LC(*s)) {
1453 if (tmp && (norun || regtry(prog, s)))
1454 goto got_it;
1455 else
1456 tmp = doevery;
1457 }
a0ed51b3 1458 else
ffc61ed2
JH
1459 tmp = 1;
1460 s++;
a0ed51b3 1461 }
a0ed51b3
LW
1462 }
1463 break;
a0d0e21e 1464 case DIGIT:
ffc61ed2 1465 if (do_utf8) {
8269fa76 1466 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1467 while (s < strend) {
3568d838 1468 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1469 if (tmp && (norun || regtry(prog, s)))
1470 goto got_it;
1471 else
1472 tmp = doevery;
1473 }
a0d0e21e 1474 else
ffc61ed2
JH
1475 tmp = 1;
1476 s += UTF8SKIP(s);
2b69d0c2 1477 }
a0d0e21e 1478 }
ffc61ed2
JH
1479 else {
1480 while (s < strend) {
1481 if (isDIGIT(*s)) {
1482 if (tmp && (norun || regtry(prog, s)))
1483 goto got_it;
1484 else
1485 tmp = doevery;
1486 }
a0ed51b3 1487 else
ffc61ed2
JH
1488 tmp = 1;
1489 s++;
a0ed51b3 1490 }
a0ed51b3
LW
1491 }
1492 break;
b8c5462f
JH
1493 case DIGITL:
1494 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1495 if (do_utf8) {
1496 while (s < strend) {
1497 if (isDIGIT_LC_utf8((U8*)s)) {
1498 if (tmp && (norun || regtry(prog, s)))
1499 goto got_it;
1500 else
1501 tmp = doevery;
1502 }
b8c5462f 1503 else
ffc61ed2
JH
1504 tmp = 1;
1505 s += UTF8SKIP(s);
b8c5462f 1506 }
b8c5462f 1507 }
ffc61ed2
JH
1508 else {
1509 while (s < strend) {
1510 if (isDIGIT_LC(*s)) {
1511 if (tmp && (norun || regtry(prog, s)))
1512 goto got_it;
1513 else
1514 tmp = doevery;
1515 }
b8c5462f 1516 else
ffc61ed2
JH
1517 tmp = 1;
1518 s++;
b8c5462f 1519 }
b8c5462f
JH
1520 }
1521 break;
a0d0e21e 1522 case NDIGIT:
ffc61ed2 1523 if (do_utf8) {
8269fa76 1524 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1525 while (s < strend) {
3568d838 1526 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1527 if (tmp && (norun || regtry(prog, s)))
1528 goto got_it;
1529 else
1530 tmp = doevery;
1531 }
a0d0e21e 1532 else
ffc61ed2
JH
1533 tmp = 1;
1534 s += UTF8SKIP(s);
a687059c 1535 }
a0d0e21e 1536 }
ffc61ed2
JH
1537 else {
1538 while (s < strend) {
1539 if (!isDIGIT(*s)) {
1540 if (tmp && (norun || regtry(prog, s)))
1541 goto got_it;
1542 else
1543 tmp = doevery;
1544 }
a0ed51b3 1545 else
ffc61ed2
JH
1546 tmp = 1;
1547 s++;
a0ed51b3 1548 }
a0ed51b3
LW
1549 }
1550 break;
b8c5462f
JH
1551 case NDIGITL:
1552 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1553 if (do_utf8) {
1554 while (s < strend) {
1555 if (!isDIGIT_LC_utf8((U8*)s)) {
1556 if (tmp && (norun || regtry(prog, s)))
1557 goto got_it;
1558 else
1559 tmp = doevery;
1560 }
b8c5462f 1561 else
ffc61ed2
JH
1562 tmp = 1;
1563 s += UTF8SKIP(s);
b8c5462f 1564 }
a0ed51b3 1565 }
ffc61ed2
JH
1566 else {
1567 while (s < strend) {
1568 if (!isDIGIT_LC(*s)) {
1569 if (tmp && (norun || regtry(prog, s)))
1570 goto got_it;
1571 else
1572 tmp = doevery;
1573 }
cf93c79d 1574 else
ffc61ed2
JH
1575 tmp = 1;
1576 s++;
b8c5462f 1577 }
b8c5462f
JH
1578 }
1579 break;
b3c9acc1 1580 default:
3c3eec57
GS
1581 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1582 break;
d6a28714 1583 }
6eb5f6b9
JH
1584 return 0;
1585 got_it:
1586 return s;
1587}
1588
1589/*
1590 - regexec_flags - match a regexp against a string
1591 */
1592I32
1593Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1594 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1595/* strend: pointer to null at end of string */
1596/* strbeg: real beginning of string */
1597/* minend: end of match must be >=minend after stringarg. */
1598/* data: May be used for some additional optimizations. */
1599/* nosave: For optimizations. */
1600{
6eb5f6b9
JH
1601 register char *s;
1602 register regnode *c;
1603 register char *startpos = stringarg;
6eb5f6b9
JH
1604 I32 minlen; /* must match at least this many chars */
1605 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1606 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1607 constant substr. */ /* CC */
1608 I32 end_shift = 0; /* Same for the end. */ /* CC */
1609 I32 scream_pos = -1; /* Internal iterator of scream. */
1610 char *scream_olds;
1611 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1612 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1613#ifdef DEBUGGING
9e55ce06
JH
1614 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1615 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1616#endif
a30b2f1f 1617 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1618
1619 PL_regcc = 0;
1620
1621 cache_re(prog);
1622#ifdef DEBUGGING
aea4f609 1623 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1624#endif
1625
1626 /* Be paranoid... */
1627 if (prog == NULL || startpos == NULL) {
1628 Perl_croak(aTHX_ "NULL regexp parameter");
1629 return 0;
1630 }
1631
1632 minlen = prog->minlen;
61a36c01 1633 if (strend - startpos < minlen) {
a72c7584
JH
1634 DEBUG_r(PerlIO_printf(Perl_debug_log,
1635 "String too short [regexec_flags]...\n"));
1636 goto phooey;
1aa99e6b 1637 }
6eb5f6b9 1638
6eb5f6b9
JH
1639 /* Check validity of program. */
1640 if (UCHARAT(prog->program) != REG_MAGIC) {
1641 Perl_croak(aTHX_ "corrupted regexp program");
1642 }
1643
1644 PL_reg_flags = 0;
1645 PL_reg_eval_set = 0;
1646 PL_reg_maxiter = 0;
1647
1648 if (prog->reganch & ROPT_UTF8)
1649 PL_reg_flags |= RF_utf8;
1650
1651 /* Mark beginning of line for ^ and lookbehind. */
1652 PL_regbol = startpos;
1653 PL_bostr = strbeg;
1654 PL_reg_sv = sv;
1655
1656 /* Mark end of line for $ (and such) */
1657 PL_regeol = strend;
1658
1659 /* see how far we have to get to not match where we matched before */
1660 PL_regtill = startpos+minend;
1661
1662 /* We start without call_cc context. */
1663 PL_reg_call_cc = 0;
1664
1665 /* If there is a "must appear" string, look for it. */
1666 s = startpos;
1667
1668 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1669 MAGIC *mg;
1670
1671 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1672 PL_reg_ganch = startpos;
1673 else if (sv && SvTYPE(sv) >= SVt_PVMG
1674 && SvMAGIC(sv)
14befaf4
DM
1675 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1676 && mg->mg_len >= 0) {
6eb5f6b9
JH
1677 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1678 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1679 if (s > PL_reg_ganch)
6eb5f6b9
JH
1680 goto phooey;
1681 s = PL_reg_ganch;
1682 }
1683 }
1684 else /* pos() not defined */
1685 PL_reg_ganch = strbeg;
1686 }
1687
33b8afdf 1688 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1689 re_scream_pos_data d;
1690
1691 d.scream_olds = &scream_olds;
1692 d.scream_pos = &scream_pos;
1693 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1694 if (!s) {
1695 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1696 goto phooey; /* not present */
3fa9c3d7 1697 }
6eb5f6b9
JH
1698 }
1699
2a782b5b 1700 DEBUG_r({
9e55ce06
JH
1701 char *s0 = UTF ?
1702 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1703 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1704 prog->precomp;
1705 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1706 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1707 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1708 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1709 if (!PL_colorset)
1710 reginitcolors();
1711 PerlIO_printf(Perl_debug_log,
9e55ce06 1712 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1713 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1714 len0, len0, s0,
2a782b5b 1715 PL_colors[1],
9e55ce06 1716 len0 > 60 ? "..." : "",
2a782b5b 1717 PL_colors[0],
9e55ce06
JH
1718 (int)(len1 > 60 ? 60 : len1),
1719 s1, PL_colors[1],
1720 (len1 > 60 ? "..." : "")
2a782b5b
JH
1721 );
1722 });
6eb5f6b9
JH
1723
1724 /* Simplest case: anchored match need be tried only once. */
1725 /* [unless only anchor is BOL and multiline is set] */
1726 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1727 if (s == startpos && regtry(prog, startpos))
1728 goto got_it;
1729 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1730 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1731 {
1732 char *end;
1733
1734 if (minlen)
1735 dontbother = minlen - 1;
1aa99e6b 1736 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1737 /* for multiline we only have to try after newlines */
33b8afdf 1738 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1739 if (s == startpos)
1740 goto after_try;
1741 while (1) {
1742 if (regtry(prog, s))
1743 goto got_it;
1744 after_try:
1745 if (s >= end)
1746 goto phooey;
1747 if (prog->reganch & RE_USE_INTUIT) {
1748 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1749 if (!s)
1750 goto phooey;
1751 }
1752 else
1753 s++;
1754 }
1755 } else {
1756 if (s > startpos)
1757 s--;
1758 while (s < end) {
1759 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1760 if (regtry(prog, s))
1761 goto got_it;
1762 }
1763 }
1764 }
1765 }
1766 goto phooey;
1767 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1768 if (regtry(prog, PL_reg_ganch))
1769 goto got_it;
1770 goto phooey;
1771 }
1772
1773 /* Messy cases: unanchored match. */
33b8afdf 1774 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1775 /* we have /x+whatever/ */
1776 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1777 char ch;
bf93d4cc
GS
1778#ifdef DEBUGGING
1779 int did_match = 0;
1780#endif
33b8afdf
JH
1781 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1782 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1783 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1784
1aa99e6b 1785 if (do_utf8) {
6eb5f6b9
JH
1786 while (s < strend) {
1787 if (*s == ch) {
bf93d4cc 1788 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1789 if (regtry(prog, s)) goto got_it;
1790 s += UTF8SKIP(s);
1791 while (s < strend && *s == ch)
1792 s += UTF8SKIP(s);
1793 }
1794 s += UTF8SKIP(s);
1795 }
1796 }
1797 else {
1798 while (s < strend) {
1799 if (*s == ch) {
bf93d4cc 1800 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1801 if (regtry(prog, s)) goto got_it;
1802 s++;
1803 while (s < strend && *s == ch)
1804 s++;
1805 }
1806 s++;
1807 }
1808 }
b7953727 1809 DEBUG_r(if (!did_match)
bf93d4cc 1810 PerlIO_printf(Perl_debug_log,
b7953727
JH
1811 "Did not find anchored character...\n")
1812 );
6eb5f6b9
JH
1813 }
1814 /*SUPPRESS 560*/
33b8afdf
JH
1815 else if (prog->anchored_substr != Nullsv
1816 || prog->anchored_utf8 != Nullsv
1817 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1818 && prog->float_max_offset < strend - s)) {
1819 SV *must;
1820 I32 back_max;
1821 I32 back_min;
1822 char *last;
6eb5f6b9 1823 char *last1; /* Last position checked before */
bf93d4cc
GS
1824#ifdef DEBUGGING
1825 int did_match = 0;
1826#endif
33b8afdf
JH
1827 if (prog->anchored_substr || prog->anchored_utf8) {
1828 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1829 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1830 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1831 back_max = back_min = prog->anchored_offset;
1832 } else {
1833 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1834 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1835 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1836 back_max = prog->float_max_offset;
1837 back_min = prog->float_min_offset;
1838 }
1839 if (must == &PL_sv_undef)
1840 /* could not downgrade utf8 check substring, so must fail */
1841 goto phooey;
1842
1843 last = HOP3c(strend, /* Cannot start after this */
1844 -(I32)(CHR_SVLEN(must)
1845 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1846
1847 if (s > PL_bostr)
1848 last1 = HOPc(s, -1);
1849 else
1850 last1 = s - 1; /* bogus */
1851
1852 /* XXXX check_substr already used to find `s', can optimize if
1853 check_substr==must. */
1854 scream_pos = -1;
1855 dontbother = end_shift;
1856 strend = HOPc(strend, -dontbother);
1857 while ( (s <= last) &&
9041c2e3 1858 ((flags & REXEC_SCREAM)
1aa99e6b 1859 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1860 end_shift, &scream_pos, 0))
1aa99e6b 1861 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1862 (unsigned char*)strend, must,
6eb5f6b9 1863 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1864 /* we may be pointing at the wrong string */
1865 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
7ef91622 1866 s = strbeg + (s - SvPVX(sv));
bf93d4cc 1867 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1868 if (HOPc(s, -back_max) > last1) {
1869 last1 = HOPc(s, -back_min);
1870 s = HOPc(s, -back_max);
1871 }
1872 else {
1873 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1874
1875 last1 = HOPc(s, -back_min);
1876 s = t;
1877 }
1aa99e6b 1878 if (do_utf8) {
6eb5f6b9
JH
1879 while (s <= last1) {
1880 if (regtry(prog, s))
1881 goto got_it;
1882 s += UTF8SKIP(s);
1883 }
1884 }
1885 else {
1886 while (s <= last1) {
1887 if (regtry(prog, s))
1888 goto got_it;
1889 s++;
1890 }
1891 }
1892 }
b7953727
JH
1893 DEBUG_r(if (!did_match)
1894 PerlIO_printf(Perl_debug_log,
1895 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1896 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1897 ? "anchored" : "floating"),
1898 PL_colors[0],
1899 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1900 SvPVX(must),
b7953727
JH
1901 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1902 );
6eb5f6b9
JH
1903 goto phooey;
1904 }
155aba94 1905 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1906 if (minlen) {
1907 I32 op = (U8)OP(prog->regstclass);
66e933ab 1908 /* don't bother with what can't match */
f14c76ed
RGS
1909 if (PL_regkind[op] != EXACT && op != CANY)
1910 strend = HOPc(strend, -(minlen - 1));
1911 }
ffc61ed2
JH
1912 DEBUG_r({
1913 SV *prop = sv_newmortal();
9e55ce06
JH
1914 char *s0;
1915 char *s1;
1916 int len0;
1917 int len1;
1918
ffc61ed2 1919 regprop(prop, c);
9e55ce06
JH
1920 s0 = UTF ?
1921 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1922 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1923 SvPVX(prop);
1924 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1925 s1 = UTF ?
c728cb41 1926 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1927 len1 = UTF ? SvCUR(dsv1) : strend - s;
1928 PerlIO_printf(Perl_debug_log,
1929 "Matching stclass `%*.*s' against `%*.*s'\n",
1930 len0, len0, s0,
1931 len1, len1, s1);
ffc61ed2 1932 });
6eb5f6b9
JH
1933 if (find_byclass(prog, c, s, strend, startpos, 0))
1934 goto got_it;
bf93d4cc 1935 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1936 }
1937 else {
1938 dontbother = 0;
33b8afdf
JH
1939 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1940 /* Trim the end. */
d6a28714 1941 char *last;
33b8afdf
JH
1942 SV* float_real;
1943
1944 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1945 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1946 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1947
1948 if (flags & REXEC_SCREAM) {
33b8afdf 1949 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1950 end_shift, &scream_pos, 1); /* last one */
1951 if (!last)
ffc61ed2 1952 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1953 /* we may be pointing at the wrong string */
1954 else if (RX_MATCH_COPIED(prog))
7ef91622 1955 s = strbeg + (s - SvPVX(sv));
b8c5462f 1956 }
d6a28714
JH
1957 else {
1958 STRLEN len;
33b8afdf 1959 char *little = SvPV(float_real, len);
d6a28714 1960
33b8afdf 1961 if (SvTAIL(float_real)) {
d6a28714
JH
1962 if (memEQ(strend - len + 1, little, len - 1))
1963 last = strend - len + 1;
1964 else if (!PL_multiline)
9041c2e3 1965 last = memEQ(strend - len, little, len)
d6a28714 1966 ? strend - len : Nullch;
b8c5462f 1967 else
d6a28714
JH
1968 goto find_last;
1969 } else {
1970 find_last:
9041c2e3 1971 if (len)
d6a28714 1972 last = rninstr(s, strend, little, little + len);
b8c5462f 1973 else
d6a28714 1974 last = strend; /* matching `$' */
b8c5462f 1975 }
b8c5462f 1976 }
bf93d4cc
GS
1977 if (last == NULL) {
1978 DEBUG_r(PerlIO_printf(Perl_debug_log,
1979 "%sCan't trim the tail, match fails (should not happen)%s\n",
1980 PL_colors[4],PL_colors[5]));
1981 goto phooey; /* Should not happen! */
1982 }
d6a28714
JH
1983 dontbother = strend - last + prog->float_min_offset;
1984 }
1985 if (minlen && (dontbother < minlen))
1986 dontbother = minlen - 1;
1987 strend -= dontbother; /* this one's always in bytes! */
1988 /* We don't know much -- general case. */
1aa99e6b 1989 if (do_utf8) {
d6a28714
JH
1990 for (;;) {
1991 if (regtry(prog, s))
1992 goto got_it;
1993 if (s >= strend)
1994 break;
b8c5462f 1995 s += UTF8SKIP(s);
d6a28714
JH
1996 };
1997 }
1998 else {
1999 do {
2000 if (regtry(prog, s))
2001 goto got_it;
2002 } while (s++ < strend);
2003 }
2004 }
2005
2006 /* Failure. */
2007 goto phooey;
2008
2009got_it:
2010 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2011
2012 if (PL_reg_eval_set) {
2013 /* Preserve the current value of $^R */
2014 if (oreplsv != GvSV(PL_replgv))
2015 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2016 restored, the value remains
2017 the same. */
acfe0abc 2018 restore_pos(aTHX_ 0);
d6a28714
JH
2019 }
2020
2021 /* make sure $`, $&, $', and $digit will work later */
2022 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2023 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2024 if (flags & REXEC_COPY_STR) {
2025 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2026#ifdef PERL_COPY_ON_WRITE
2027 if ((SvIsCOW(sv)
2028 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2029 if (DEBUG_C_TEST) {
2030 PerlIO_printf(Perl_debug_log,
2031 "Copy on write: regexp capture, type %d\n",
2032 (int) SvTYPE(sv));
2033 }
2034 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2035 prog->subbeg = SvPVX(prog->saved_copy);
2036 assert (SvPOKp(prog->saved_copy));
2037 } else
2038#endif
2039 {
2040 RX_MATCH_COPIED_on(prog);
2041 s = savepvn(strbeg, i);
2042 prog->subbeg = s;
2043 }
d6a28714 2044 prog->sublen = i;
d6a28714
JH
2045 }
2046 else {
2047 prog->subbeg = strbeg;
2048 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2049 }
2050 }
9041c2e3 2051
d6a28714
JH
2052 return 1;
2053
2054phooey:
bf93d4cc
GS
2055 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2056 PL_colors[4],PL_colors[5]));
d6a28714 2057 if (PL_reg_eval_set)
acfe0abc 2058 restore_pos(aTHX_ 0);
d6a28714
JH
2059 return 0;
2060}
2061
2062/*
2063 - regtry - try match at specific point
2064 */
2065STATIC I32 /* 0 failure, 1 success */
2066S_regtry(pTHX_ regexp *prog, char *startpos)
2067{
d6a28714
JH
2068 register I32 i;
2069 register I32 *sp;
2070 register I32 *ep;
2071 CHECKPOINT lastcp;
2072
02db2b7b
IZ
2073#ifdef DEBUGGING
2074 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2075#endif
d6a28714
JH
2076 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2077 MAGIC *mg;
2078
2079 PL_reg_eval_set = RS_init;
2080 DEBUG_r(DEBUG_s(
b900a521
JH
2081 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2082 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2083 ));
e8347627 2084 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2085 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2086 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2087 SAVETMPS;
2088 /* Apparently this is not needed, judging by wantarray. */
e8347627 2089 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2090 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2091
2092 if (PL_reg_sv) {
2093 /* Make $_ available to executed code. */
2094 if (PL_reg_sv != DEFSV) {
4d1ff10f 2095 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2096 SAVESPTR(DEFSV);
2097 DEFSV = PL_reg_sv;
b8c5462f 2098 }
d6a28714 2099
9041c2e3 2100 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2101 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2102 /* prepare for quick setting of pos */
14befaf4
DM
2103 sv_magic(PL_reg_sv, (SV*)0,
2104 PERL_MAGIC_regex_global, Nullch, 0);
2105 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2106 mg->mg_len = -1;
b8c5462f 2107 }
d6a28714
JH
2108 PL_reg_magic = mg;
2109 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2110 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2111 }
09687e5a 2112 if (!PL_reg_curpm) {
0f79a09d 2113 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2114#ifdef USE_ITHREADS
2115 {
2116 SV* repointer = newSViv(0);
577e12cc 2117 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2118 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2119 av_push(PL_regex_padav,repointer);
2120 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2121 PL_regex_pad = AvARRAY(PL_regex_padav);
2122 }
2123#endif
2124 }
aaa362c4 2125 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2126 PL_reg_oldcurpm = PL_curpm;
2127 PL_curpm = PL_reg_curpm;
2128 if (RX_MATCH_COPIED(prog)) {
2129 /* Here is a serious problem: we cannot rewrite subbeg,
2130 since it may be needed if this match fails. Thus
2131 $` inside (?{}) could fail... */
2132 PL_reg_oldsaved = prog->subbeg;
2133 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2134#ifdef PERL_COPY_ON_WRITE
2135 PL_nrs = prog->saved_copy;
2136#endif
d6a28714
JH
2137 RX_MATCH_COPIED_off(prog);
2138 }
2139 else
2140 PL_reg_oldsaved = Nullch;
2141 prog->subbeg = PL_bostr;
2142 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2143 }
2144 prog->startp[0] = startpos - PL_bostr;
2145 PL_reginput = startpos;
2146 PL_regstartp = prog->startp;
2147 PL_regendp = prog->endp;
2148 PL_reglastparen = &prog->lastparen;
a01268b5 2149 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2150 prog->lastparen = 0;
03994de8 2151 prog->lastcloseparen = 0;
d6a28714
JH
2152 PL_regsize = 0;
2153 DEBUG_r(PL_reg_starttry = startpos);
2154 if (PL_reg_start_tmpl <= prog->nparens) {
2155 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2156 if(PL_reg_start_tmp)
2157 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2158 else
2159 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2160 }
2161
2162 /* XXXX What this code is doing here?!!! There should be no need
2163 to do this again and again, PL_reglastparen should take care of
3dd2943c 2164 this! --ilya*/
dafc8851
JH
2165
2166 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2167 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2168 * PL_reglastparen), is not needed at all by the test suite
2169 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2170 * enough, for building DynaLoader, or otherwise this
2171 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2172 * will happen. Meanwhile, this code *is* needed for the
2173 * above-mentioned test suite tests to succeed. The common theme
2174 * on those tests seems to be returning null fields from matches.
2175 * --jhi */
dafc8851 2176#if 1
d6a28714
JH
2177 sp = prog->startp;
2178 ep = prog->endp;
2179 if (prog->nparens) {
eb160463 2180 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2181 *++sp = -1;
2182 *++ep = -1;
2183 }
2184 }
dafc8851 2185#endif
02db2b7b 2186 REGCP_SET(lastcp);
d6a28714
JH
2187 if (regmatch(prog->program + 1)) {
2188 prog->endp[0] = PL_reginput - PL_bostr;
2189 return 1;
2190 }
02db2b7b 2191 REGCP_UNWIND(lastcp);
d6a28714
JH
2192 return 0;
2193}
2194
02db2b7b
IZ
2195#define RE_UNWIND_BRANCH 1
2196#define RE_UNWIND_BRANCHJ 2
2197
2198union re_unwind_t;
2199
2200typedef struct { /* XX: makes sense to enlarge it... */
2201 I32 type;
2202 I32 prev;
2203 CHECKPOINT lastcp;
2204} re_unwind_generic_t;
2205
2206typedef struct {
2207 I32 type;
2208 I32 prev;
2209 CHECKPOINT lastcp;
2210 I32 lastparen;
2211 regnode *next;
2212 char *locinput;
2213 I32 nextchr;
2214#ifdef DEBUGGING
2215 int regindent;
2216#endif
2217} re_unwind_branch_t;
2218
2219typedef union re_unwind_t {
2220 I32 type;
2221 re_unwind_generic_t generic;
2222 re_unwind_branch_t branch;
2223} re_unwind_t;
2224
8ba1375e
MJD
2225#define sayYES goto yes
2226#define sayNO goto no
e0f9d4a8 2227#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2228#define sayYES_FINAL goto yes_final
2229#define sayYES_LOUD goto yes_loud
2230#define sayNO_FINAL goto no_final
2231#define sayNO_SILENT goto do_no
2232#define saySAME(x) if (x) goto yes; else goto no
2233
2234#define REPORT_CODE_OFF 24
2235
d6a28714
JH
2236/*
2237 - regmatch - main matching routine
2238 *
2239 * Conceptually the strategy is simple: check to see whether the current
2240 * node matches, call self recursively to see whether the rest matches,
2241 * and then act accordingly. In practice we make some effort to avoid
2242 * recursion, in particular by going through "ordinary" nodes (that don't
2243 * need to know whether the rest of the match failed) by a loop instead of
2244 * by recursion.
2245 */
2246/* [lwall] I've hoisted the register declarations to the outer block in order to
2247 * maybe save a little bit of pushing and popping on the stack. It also takes
2248 * advantage of machines that use a register save mask on subroutine entry.
2249 */
2250STATIC I32 /* 0 failure, 1 success */
2251S_regmatch(pTHX_ regnode *prog)
2252{
d6a28714
JH
2253 register regnode *scan; /* Current node. */
2254 regnode *next; /* Next node. */
2255 regnode *inner; /* Next node in internal branch. */
2256 register I32 nextchr; /* renamed nextchr - nextchar colides with
2257 function of same name */
2258 register I32 n; /* no or next */
b7953727
JH
2259 register I32 ln = 0; /* len or last */
2260 register char *s = Nullch; /* operand or save */
d6a28714 2261 register char *locinput = PL_reginput;
b7953727 2262 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2263 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2264 I32 unwind = 0;
b7953727 2265#if 0
02db2b7b 2266 I32 firstcp = PL_savestack_ix;
b7953727 2267#endif
53c4c00c 2268 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2269#ifdef DEBUGGING
ce333219
JH
2270 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2271 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2272 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2273#endif
02db2b7b 2274
d6a28714
JH
2275#ifdef DEBUGGING
2276 PL_regindent++;
2277#endif
2278
2279 /* Note that nextchr is a byte even in UTF */
2280 nextchr = UCHARAT(locinput);
2281 scan = prog;
2282 while (scan != NULL) {
8ba1375e 2283
2a782b5b 2284 DEBUG_r( {
d6a28714
JH
2285 SV *prop = sv_newmortal();
2286 int docolor = *PL_colors[0];
2287 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2288 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2289 /* The part of the string before starttry has one color
2290 (pref0_len chars), between starttry and current
2291 position another one (pref_len - pref0_len chars),
2292 after the current position the third one.
2293 We assume that pref0_len <= pref_len, otherwise we
2294 decrease pref0_len. */
9041c2e3 2295 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2296 ? (5 + taill) - l : locinput - PL_bostr;
2297 int pref0_len;
d6a28714 2298
df1ffd02 2299 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2300 pref_len++;
2301 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2302 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2303 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2304 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2305 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2306 l--;
d6a28714
JH
2307 if (pref0_len < 0)
2308 pref0_len = 0;
2309 if (pref0_len > pref_len)
2310 pref0_len = pref_len;
2311 regprop(prop, scan);
2a782b5b
JH
2312 {
2313 char *s0 =
f14c76ed 2314 do_utf8 && OP(scan) != CANY ?
2a782b5b 2315 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2316 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2317 locinput - pref_len;
df1ffd02 2318 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2319 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2320 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2321 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2322 locinput - pref_len + pref0_len;
df1ffd02 2323 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2324 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2325 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2326 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2327 locinput;
df1ffd02 2328 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2329 PerlIO_printf(Perl_debug_log,
2330 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2331 (IV)(locinput - PL_bostr),
2332 PL_colors[4],
2333 len0, s0,
2334 PL_colors[5],
2335 PL_colors[2],
2336 len1, s1,
2337 PL_colors[3],
2338 (docolor ? "" : "> <"),
2339 PL_colors[0],
2340 len2, s2,
2341 PL_colors[1],
2342 15 - l - pref_len + 1,
2343 "",
2344 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2345 SvPVX(prop));
2346 }
2347 });
d6a28714
JH
2348
2349 next = scan + NEXT_OFF(scan);
2350 if (next == scan)
2351 next = NULL;
2352
2353 switch (OP(scan)) {
2354 case BOL:
12d33761
HS
2355 if (locinput == PL_bostr || (PL_multiline &&
2356 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2357 {
2358 /* regtill = regbol; */
b8c5462f
JH
2359 break;
2360 }
d6a28714
JH
2361 sayNO;
2362 case MBOL:
12d33761
HS
2363 if (locinput == PL_bostr ||
2364 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2365 {
b8c5462f
JH
2366 break;
2367 }
d6a28714
JH
2368 sayNO;
2369 case SBOL:
c2a73568 2370 if (locinput == PL_bostr)
b8c5462f 2371 break;
d6a28714
JH
2372 sayNO;
2373 case GPOS:
2374 if (locinput == PL_reg_ganch)
2375 break;
2376 sayNO;
2377 case EOL:
2378 if (PL_multiline)
2379 goto meol;
2380 else
2381 goto seol;
2382 case MEOL:
2383 meol:
2384 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2385 sayNO;
b8c5462f 2386 break;
d6a28714
JH
2387 case SEOL:
2388 seol:
2389 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2390 sayNO;
d6a28714 2391 if (PL_regeol - locinput > 1)
b8c5462f 2392 sayNO;
b8c5462f 2393 break;
d6a28714
JH
2394 case EOS:
2395 if (PL_regeol != locinput)
b8c5462f 2396 sayNO;
d6a28714 2397 break;
ffc61ed2 2398 case SANY:
d6a28714 2399 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2400 sayNO;
f33976b4
DB
2401 if (do_utf8) {
2402 locinput += PL_utf8skip[nextchr];
2403 if (locinput > PL_regeol)
2404 sayNO;
2405 nextchr = UCHARAT(locinput);
2406 }
2407 else
2408 nextchr = UCHARAT(++locinput);
2409 break;
2410 case CANY:
2411 if (!nextchr && locinput >= PL_regeol)
2412 sayNO;
b8c5462f 2413 nextchr = UCHARAT(++locinput);
a0d0e21e 2414 break;
ffc61ed2 2415 case REG_ANY:
1aa99e6b
IH
2416 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2417 sayNO;
2418 if (do_utf8) {
b8c5462f 2419 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2420 if (locinput > PL_regeol)
2421 sayNO;
a0ed51b3 2422 nextchr = UCHARAT(locinput);
a0ed51b3 2423 }
1aa99e6b
IH
2424 else
2425 nextchr = UCHARAT(++locinput);
a0ed51b3 2426 break;
d6a28714 2427 case EXACT:
cd439c50
IZ
2428 s = STRING(scan);
2429 ln = STR_LEN(scan);
eb160463 2430 if (do_utf8 != UTF) {
bc517b45 2431 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2432 char *l = locinput;
2433 char *e = s + ln;
bc517b45 2434 STRLEN ulen;
a72c7584 2435
5ff6fc6d
JH
2436 if (do_utf8) {
2437 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2438 while (s < e) {
2439 if (l >= PL_regeol)
5ff6fc6d
JH
2440 sayNO;
2441 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2442 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2443 ckWARN(WARN_UTF8) ?
2444 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2445 sayNO;
bc517b45 2446 l += ulen;
5ff6fc6d 2447 s ++;
1aa99e6b 2448 }
5ff6fc6d
JH
2449 }
2450 else {
2451 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2452 while (s < e) {
2453 if (l >= PL_regeol)
2454 sayNO;
5ff6fc6d 2455 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2456 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2457 ckWARN(WARN_UTF8) ?
2458 0 : UTF8_ALLOW_ANY))
1aa99e6b 2459 sayNO;
bc517b45 2460 s += ulen;
a72c7584 2461 l ++;
1aa99e6b 2462 }
5ff6fc6d 2463 }
1aa99e6b
IH
2464 locinput = l;
2465 nextchr = UCHARAT(locinput);
2466 break;
2467 }
bc517b45 2468 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2469 /* Inline the first character, for speed. */
2470 if (UCHARAT(s) != nextchr)
2471 sayNO;
2472 if (PL_regeol - locinput < ln)
2473 sayNO;
2474 if (ln > 1 && memNE(s, locinput, ln))
2475 sayNO;
2476 locinput += ln;
2477 nextchr = UCHARAT(locinput);
2478 break;
2479 case EXACTFL:
b8c5462f
JH
2480 PL_reg_flags |= RF_tainted;
2481 /* FALL THROUGH */
d6a28714 2482 case EXACTF:
cd439c50
IZ
2483 s = STRING(scan);
2484 ln = STR_LEN(scan);
d6a28714 2485
d07ddd77
JH
2486 if (do_utf8 || UTF) {
2487 /* Either target or the pattern are utf8. */
d6a28714 2488 char *l = locinput;
d07ddd77 2489 char *e = PL_regeol;
bc517b45 2490
eb160463 2491 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2492 l, &e, 0, do_utf8)) {
5486206c
JH
2493 /* One more case for the sharp s:
2494 * pack("U0U*", 0xDF) =~ /ss/i,
2495 * the 0xC3 0x9F are the UTF-8
2496 * byte sequence for the U+00DF. */
2497 if (!(do_utf8 &&
2498 toLOWER(s[0]) == 's' &&
2499 ln >= 2 &&
2500 toLOWER(s[1]) == 's' &&
2501 (U8)l[0] == 0xC3 &&
2502 e - l >= 2 &&
2503 (U8)l[1] == 0x9F))
2504 sayNO;
2505 }
d07ddd77
JH
2506 locinput = e;
2507 nextchr = UCHARAT(locinput);
2508 break;
a0ed51b3 2509 }
d6a28714 2510
bc517b45
JH
2511 /* Neither the target and the pattern are utf8. */
2512
d6a28714
JH
2513 /* Inline the first character, for speed. */
2514 if (UCHARAT(s) != nextchr &&
2515 UCHARAT(s) != ((OP(scan) == EXACTF)
2516 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2517 sayNO;
d6a28714 2518 if (PL_regeol - locinput < ln)
b8c5462f 2519 sayNO;
d6a28714
JH
2520 if (ln > 1 && (OP(scan) == EXACTF
2521 ? ibcmp(s, locinput, ln)
2522 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2523 sayNO;
d6a28714
JH
2524 locinput += ln;
2525 nextchr = UCHARAT(locinput);
a0d0e21e 2526 break;
d6a28714 2527 case ANYOF:
ffc61ed2 2528 if (do_utf8) {
9e55ce06
JH
2529 STRLEN inclasslen = PL_regeol - locinput;
2530
ba7b4546 2531 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2532 sayNO_ANYOF;
ffc61ed2
JH
2533 if (locinput >= PL_regeol)
2534 sayNO;
0f0076b4 2535 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2536 nextchr = UCHARAT(locinput);
e0f9d4a8 2537 break;
ffc61ed2
JH
2538 }
2539 else {
2540 if (nextchr < 0)
2541 nextchr = UCHARAT(locinput);
7d3e948e 2542 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2543 sayNO_ANYOF;
ffc61ed2
JH
2544 if (!nextchr && locinput >= PL_regeol)
2545 sayNO;
2546 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2547 break;
2548 }
2549 no_anyof:
2550 /* If we might have the case of the German sharp s
2551 * in a casefolding Unicode character class. */
2552
ebc501f0
JH
2553 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2554 locinput += SHARP_S_SKIP;
e0f9d4a8 2555 nextchr = UCHARAT(locinput);
ffc61ed2 2556 }
e0f9d4a8
JH
2557 else
2558 sayNO;
b8c5462f 2559 break;
d6a28714 2560 case ALNUML:
b8c5462f
JH
2561 PL_reg_flags |= RF_tainted;
2562 /* FALL THROUGH */
d6a28714 2563 case ALNUM:
b8c5462f 2564 if (!nextchr)
4633a7c4 2565 sayNO;
ffc61ed2 2566 if (do_utf8) {
ad24be35 2567 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2568 if (!(OP(scan) == ALNUM
3568d838 2569 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2570 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2571 {
2572 sayNO;
a0ed51b3 2573 }
b8c5462f 2574 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2575 nextchr = UCHARAT(locinput);
2576 break;
2577 }
ffc61ed2 2578 if (!(OP(scan) == ALNUM
d6a28714 2579 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2580 sayNO;
b8c5462f 2581 nextchr = UCHARAT(++locinput);
a0d0e21e 2582 break;
d6a28714 2583 case NALNUML:
b8c5462f
JH
2584 PL_reg_flags |= RF_tainted;
2585 /* FALL THROUGH */
d6a28714
JH
2586 case NALNUM:
2587 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2588 sayNO;
ffc61ed2 2589 if (do_utf8) {
8269fa76 2590 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2591 if (OP(scan) == NALNUM
3568d838 2592 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2593 : isALNUM_LC_utf8((U8*)locinput))
2594 {
b8c5462f 2595 sayNO;
d6a28714 2596 }
b8c5462f
JH
2597 locinput += PL_utf8skip[nextchr];
2598 nextchr = UCHARAT(locinput);
2599 break;
2600 }
ffc61ed2 2601 if (OP(scan) == NALNUM
d6a28714 2602 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2603 sayNO;
76e3520e 2604 nextchr = UCHARAT(++locinput);
a0d0e21e 2605 break;
d6a28714
JH
2606 case BOUNDL:
2607 case NBOUNDL:
3280af22 2608 PL_reg_flags |= RF_tainted;
bbce6d69 2609 /* FALL THROUGH */
d6a28714
JH
2610 case BOUND:
2611 case NBOUND:
2612 /* was last char in word? */
ffc61ed2 2613 if (do_utf8) {
12d33761
HS
2614 if (locinput == PL_bostr)
2615 ln = '\n';
ffc61ed2 2616 else {
b4f7163a 2617 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2618
b4f7163a 2619 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2620 }
2621 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2622 ln = isALNUM_uni(ln);
8269fa76 2623 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2624 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2625 }
2626 else {
9041c2e3 2627 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2628 n = isALNUM_LC_utf8((U8*)locinput);
2629 }
a0ed51b3 2630 }
d6a28714 2631 else {
12d33761
HS
2632 ln = (locinput != PL_bostr) ?
2633 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2634 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2635 ln = isALNUM(ln);
2636 n = isALNUM(nextchr);
2637 }
2638 else {
2639 ln = isALNUM_LC(ln);
2640 n = isALNUM_LC(nextchr);
2641 }
d6a28714 2642 }
ffc61ed2
JH
2643 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2644 OP(scan) == BOUNDL))
2645 sayNO;
a0ed51b3 2646 break;
d6a28714 2647 case SPACEL:
3280af22 2648 PL_reg_flags |= RF_tainted;
bbce6d69 2649 /* FALL THROUGH */
d6a28714 2650 case SPACE:
9442cb0e 2651 if (!nextchr)
4633a7c4 2652 sayNO;
1aa99e6b 2653 if (do_utf8) {
fd400ab9 2654 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2655 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2656 if (!(OP(scan) == SPACE
3568d838 2657 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2658 : isSPACE_LC_utf8((U8*)locinput)))
2659 {
2660 sayNO;
2661 }
2662 locinput += PL_utf8skip[nextchr];
2663 nextchr = UCHARAT(locinput);
2664 break;
d6a28714 2665 }
ffc61ed2
JH
2666 if (!(OP(scan) == SPACE
2667 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2668 sayNO;
2669 nextchr = UCHARAT(++locinput);
2670 }
2671 else {
2672 if (!(OP(scan) == SPACE
2673 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2674 sayNO;
2675 nextchr = UCHARAT(++locinput);
a0ed51b3 2676 }
a0ed51b3 2677 break;
d6a28714 2678 case NSPACEL:
3280af22 2679 PL_reg_flags |= RF_tainted;
bbce6d69 2680 /* FALL THROUGH */
d6a28714 2681 case NSPACE:
9442cb0e 2682 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2683 sayNO;
1aa99e6b 2684 if (do_utf8) {
8269fa76 2685 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2686 if (OP(scan) == NSPACE
3568d838 2687 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2688 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2689 {
2690 sayNO;
2691 }
2692 locinput += PL_utf8skip[nextchr];
2693 nextchr = UCHARAT(locinput);
2694 break;
a0ed51b3 2695 }
ffc61ed2 2696 if (OP(scan) == NSPACE
d6a28714 2697 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2698 sayNO;
76e3520e 2699 nextchr = UCHARAT(++locinput);
a0d0e21e 2700 break;
d6a28714 2701 case DIGITL:
a0ed51b3
LW
2702 PL_reg_flags |= RF_tainted;
2703 /* FALL THROUGH */
d6a28714 2704 case DIGIT:
9442cb0e 2705 if (!nextchr)
a0ed51b3 2706 sayNO;
1aa99e6b 2707 if (do_utf8) {
8269fa76 2708 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2709 if (!(OP(scan) == DIGIT
3568d838 2710 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2711 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2712 {
a0ed51b3 2713 sayNO;
dfe13c55 2714 }
6f06b55f 2715 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2716 nextchr = UCHARAT(locinput);
2717 break;
2718 }
ffc61ed2 2719 if (!(OP(scan) == DIGIT
9442cb0e 2720 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2721 sayNO;
2722 nextchr = UCHARAT(++locinput);
2723 break;
d6a28714 2724 case NDIGITL:
b8c5462f
JH
2725 PL_reg_flags |= RF_tainted;
2726 /* FALL THROUGH */
d6a28714 2727 case NDIGIT:
9442cb0e 2728 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2729 sayNO;
1aa99e6b 2730 if (do_utf8) {
8269fa76 2731 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2732 if (OP(scan) == NDIGIT
3568d838 2733 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2734 : isDIGIT_LC_utf8((U8*)locinput))
2735 {
a0ed51b3 2736 sayNO;
9442cb0e 2737 }
6f06b55f 2738 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2739 nextchr = UCHARAT(locinput);
2740 break;
2741 }
ffc61ed2 2742 if (OP(scan) == NDIGIT
9442cb0e 2743 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2744 sayNO;
2745 nextchr = UCHARAT(++locinput);
2746 break;
2747 case CLUMP:
b7c83a7e 2748 if (locinput >= PL_regeol)
a0ed51b3 2749 sayNO;
b7c83a7e
JH
2750 if (do_utf8) {
2751 LOAD_UTF8_CHARCLASS(mark,"~");
2752 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2753 sayNO;
2754 locinput += PL_utf8skip[nextchr];
2755 while (locinput < PL_regeol &&
2756 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2757 locinput += UTF8SKIP(locinput);
2758 if (locinput > PL_regeol)
2759 sayNO;
eb08e2da
JH
2760 }
2761 else
2762 locinput++;
a0ed51b3
LW
2763 nextchr = UCHARAT(locinput);
2764 break;
c8756f30 2765 case REFFL:
3280af22 2766 PL_reg_flags |= RF_tainted;
c8756f30 2767 /* FALL THROUGH */
c277df42 2768 case REF:
c8756f30 2769 case REFF:
c277df42 2770 n = ARG(scan); /* which paren pair */
cf93c79d 2771 ln = PL_regstartp[n];
2c2d71f5 2772 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2773 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2774 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2775 if (ln == PL_regendp[n])
a0d0e21e 2776 break;
a0ed51b3 2777
cf93c79d 2778 s = PL_bostr + ln;
1aa99e6b 2779 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2780 char *l = locinput;
cf93c79d 2781 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2782 /*
2783 * Note that we can't do the "other character" lookup trick as
2784 * in the 8-bit case (no pun intended) because in Unicode we
2785 * have to map both upper and title case to lower case.
2786 */
2787 if (OP(scan) == REFF) {
a2a2844f 2788 STRLEN ulen1, ulen2;
e7ae6809
JH
2789 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2790 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2791 while (s < e) {
2792 if (l >= PL_regeol)
2793 sayNO;
a2a2844f
JH
2794 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2795 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2796 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2797 sayNO;
a2a2844f
JH
2798 s += ulen1;
2799 l += ulen2;
a0ed51b3
LW
2800 }
2801 }
2802 locinput = l;
2803 nextchr = UCHARAT(locinput);
2804 break;
2805 }
2806
a0d0e21e 2807 /* Inline the first character, for speed. */
76e3520e 2808 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2809 (OP(scan) == REF ||
2810 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2811 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2812 sayNO;
cf93c79d 2813 ln = PL_regendp[n] - ln;
3280af22 2814 if (locinput + ln > PL_regeol)
4633a7c4 2815 sayNO;
c8756f30
AK
2816 if (ln > 1 && (OP(scan) == REF
2817 ? memNE(s, locinput, ln)
2818 : (OP(scan) == REFF
2819 ? ibcmp(s, locinput, ln)
2820 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2821 sayNO;
a0d0e21e 2822 locinput += ln;
76e3520e 2823 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2824 break;
2825
2826 case NOTHING:
c277df42 2827 case TAIL:
a0d0e21e
LW
2828 break;
2829 case BACK:
2830 break;
c277df42
IZ
2831 case EVAL:
2832 {
2833 dSP;
533c011a 2834 OP_4tree *oop = PL_op;
3280af22 2835 COP *ocurcop = PL_curcop;
f3548bdc 2836 PAD *old_comppad;
c277df42 2837 SV *ret;
080c2dec 2838 struct regexp *oreg = PL_reg_re;
9041c2e3 2839
c277df42 2840 n = ARG(scan);
533c011a 2841 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2842 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2843 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2844 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2845
8e5e9ebe
RGS
2846 {
2847 SV **before = SP;
2848 CALLRUNOPS(aTHX); /* Scalar context. */
2849 SPAGAIN;
2850 if (SP == before)
075aa684 2851 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2852 else {
2853 ret = POPs;
2854 PUTBACK;
2855 }
2856 }
2857
0f5d15d6 2858 PL_op = oop;
f3548bdc 2859 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2860 PL_curcop = ocurcop;
c277df42 2861 if (logical) {
0f5d15d6
IZ
2862 if (logical == 2) { /* Postponed subexpression. */
2863 regexp *re;
22c35a8c 2864 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2865 re_cc_state state;
0f5d15d6 2866 CHECKPOINT cp, lastcp;
cb50f42d 2867 int toggleutf;
faf82a0b 2868 register SV *sv;
0f5d15d6 2869
faf82a0b
AE
2870 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2871 mg = mg_find(sv, PERL_MAGIC_qr);
2872 else if (SvSMAGICAL(ret)) {
2873 if (SvGMAGICAL(ret))
2874 sv_unmagic(ret, PERL_MAGIC_qr);
2875 else
2876 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 2877 }
faf82a0b 2878
0f5d15d6
IZ
2879 if (mg) {
2880 re = (regexp *)mg->mg_obj;
df0003d4 2881 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2882 }
2883 else {
2884 STRLEN len;
2885 char *t = SvPV(ret, len);
2886 PMOP pm;
2887 char *oprecomp = PL_regprecomp;
2888 I32 osize = PL_regsize;
2889 I32 onpar = PL_regnpar;
2890
5fcd1c1b 2891 Zero(&pm, 1, PMOP);
cb50f42d 2892 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2893 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2894 if (!(SvFLAGS(ret)
faf82a0b
AE
2895 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2896 | SVs_GMG)))
14befaf4
DM
2897 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2898 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2899 PL_regprecomp = oprecomp;
2900 PL_regsize = osize;
2901 PL_regnpar = onpar;
2902 }
2903 DEBUG_r(
9041c2e3 2904 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2905 "Entering embedded `%s%.60s%s%s'\n",
2906 PL_colors[0],
2907 re->precomp,
2908 PL_colors[1],
2909 (strlen(re->precomp) > 60 ? "..." : ""))
2910 );
2911 state.node = next;
2912 state.prev = PL_reg_call_cc;
2913 state.cc = PL_regcc;
2914 state.re = PL_reg_re;
2915
2ab05381 2916 PL_regcc = 0;
9041c2e3 2917
0f5d15d6 2918 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2919 REGCP_SET(lastcp);
0f5d15d6
IZ
2920 cache_re(re);
2921 state.ss = PL_savestack_ix;
2922 *PL_reglastparen = 0;
a01268b5 2923 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2924 PL_reg_call_cc = &state;
2925 PL_reginput = locinput;
cb50f42d
YST
2926 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2927 ((re->reganch & ROPT_UTF8) != 0);
2928 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2929
2930 /* XXXX This is too dramatic a measure... */
2931 PL_reg_maxiter = 0;
2932
0f5d15d6 2933 if (regmatch(re->program + 1)) {
2c914db6
IZ
2934 /* Even though we succeeded, we need to restore
2935 global variables, since we may be wrapped inside
2936 SUSPEND, thus the match may be not finished yet. */
2937
2938 /* XXXX Do this only if SUSPENDed? */
2939 PL_reg_call_cc = state.prev;
2940 PL_regcc = state.cc;
2941 PL_reg_re = state.re;
2942 cache_re(PL_reg_re);
cb50f42d 2943 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2944
2945 /* XXXX This is too dramatic a measure... */
2946 PL_reg_maxiter = 0;
2947
2948 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2949 ReREFCNT_dec(re);
2950 regcpblow(cp);
2951 sayYES;
2952 }
0f5d15d6 2953 ReREFCNT_dec(re);
02db2b7b 2954 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2955 regcppop();
2956 PL_reg_call_cc = state.prev;
2957 PL_regcc = state.cc;
2958 PL_reg_re = state.re;
d3790889 2959 cache_re(PL_reg_re);
cb50f42d 2960 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2961
2962 /* XXXX This is too dramatic a measure... */
2963 PL_reg_maxiter = 0;
2964
8e514ae6 2965 logical = 0;
0f5d15d6
IZ
2966 sayNO;
2967 }
c277df42 2968 sw = SvTRUE(ret);
0f5d15d6 2969 logical = 0;
a0ed51b3 2970 }
080c2dec 2971 else {
3280af22 2972 sv_setsv(save_scalar(PL_replgv), ret);
080c2dec
AE
2973 cache_re(oreg);
2974 }
c277df42
IZ
2975 break;
2976 }
a0d0e21e 2977 case OPEN:
c277df42 2978 n = ARG(scan); /* which paren pair */
3280af22
NIS
2979 PL_reg_start_tmp[n] = locinput;
2980 if (n > PL_regsize)
2981 PL_regsize = n;
a0d0e21e
LW
2982 break;
2983 case CLOSE:
c277df42 2984 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2985 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2986 PL_regendp[n] = locinput - PL_bostr;
eb160463 2987 if (n > (I32)*PL_reglastparen)
3280af22 2988 *PL_reglastparen = n;
a01268b5 2989 *PL_reglastcloseparen = n;
a0d0e21e 2990 break;
c277df42
IZ
2991 case GROUPP:
2992 n = ARG(scan); /* which paren pair */
eb160463 2993 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2994 break;
2995 case IFTHEN:
2c2d71f5 2996 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2997 if (sw)
2998 next = NEXTOPER(NEXTOPER(scan));
2999 else {
3000 next = scan + ARG(scan);
3001 if (OP(next) == IFTHEN) /* Fake one. */
3002 next = NEXTOPER(NEXTOPER(next));
3003 }
3004 break;
3005 case LOGICAL:
0f5d15d6 3006 logical = scan->flags;
c277df42 3007 break;
2ab05381
IZ
3008/*******************************************************************
3009 PL_regcc contains infoblock about the innermost (...)* loop, and
3010 a pointer to the next outer infoblock.
3011
3012 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3013
3014 1) After matching X, regnode for CURLYX is processed;
3015
9041c2e3 3016 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3017 regmatch() recursively with the starting point at WHILEM node;
3018
3019 3) Each hit of WHILEM node tries to match A and Z (in the order
3020 depending on the current iteration, min/max of {min,max} and
3021 greediness). The information about where are nodes for "A"
3022 and "Z" is read from the infoblock, as is info on how many times "A"
3023 was already matched, and greediness.
3024
3025 4) After A matches, the same WHILEM node is hit again.
3026
3027 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3028 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3029 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3030 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3031 of the external loop.
3032
3033 Currently present infoblocks form a tree with a stem formed by PL_curcc
3034 and whatever it mentions via ->next, and additional attached trees
3035 corresponding to temporarily unset infoblocks as in "5" above.
3036
9041c2e3 3037 In the following picture infoblocks for outer loop of
2ab05381
IZ
3038 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3039 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3040 infoblocks are drawn below the "reset" infoblock.
3041
3042 In fact in the picture below we do not show failed matches for Z and T
3043 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3044 more obvious *why* one needs to *temporary* unset infoblocks.]
3045
3046 Matched REx position InfoBlocks Comment
3047 (Y(A)*?Z)*?T x
3048 Y(A)*?Z)*?T x <- O
3049 Y (A)*?Z)*?T x <- O
3050 Y A)*?Z)*?T x <- O <- I
3051 YA )*?Z)*?T x <- O <- I
3052 YA A)*?Z)*?T x <- O <- I
3053 YAA )*?Z)*?T x <- O <- I
3054 YAA Z)*?T x <- O # Temporary unset I
3055 I
3056
3057 YAAZ Y(A)*?Z)*?T x <- O
3058 I
3059
3060 YAAZY (A)*?Z)*?T x <- O
3061 I
3062
3063 YAAZY A)*?Z)*?T x <- O <- I
3064 I
3065
3066 YAAZYA )*?Z)*?T x <- O <- I
3067 I
3068
3069 YAAZYA Z)*?T x <- O # Temporary unset I
3070 I,I
3071
3072 YAAZYAZ )*?T x <- O
3073 I,I
3074
3075 YAAZYAZ T x # Temporary unset O
3076 O
3077 I,I
3078
3079 YAAZYAZT x
3080 O
3081 I,I
3082 *******************************************************************/
a0d0e21e
LW
3083 case CURLYX: {
3084 CURCUR cc;
3280af22 3085 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3086 /* No need to save/restore up to this paren */
3087 I32 parenfloor = scan->flags;
c277df42
IZ
3088
3089 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3090 next += ARG(next);
3280af22
NIS
3091 cc.oldcc = PL_regcc;
3092 PL_regcc = &cc;
cb434fcc
IZ
3093 /* XXXX Probably it is better to teach regpush to support
3094 parenfloor > PL_regsize... */
eb160463 3095 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3096 parenfloor = *PL_reglastparen; /* Pessimization... */
3097 cc.parenfloor = parenfloor;
a0d0e21e
LW
3098 cc.cur = -1;
3099 cc.min = ARG1(scan);
3100 cc.max = ARG2(scan);
c277df42 3101 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3102 cc.next = next;
3103 cc.minmod = minmod;
3104 cc.lastloc = 0;
3280af22 3105 PL_reginput = locinput;
a0d0e21e
LW
3106 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3107 regcpblow(cp);
3280af22 3108 PL_regcc = cc.oldcc;
4633a7c4 3109 saySAME(n);
a0d0e21e
LW
3110 }
3111 /* NOT REACHED */
3112 case WHILEM: {
3113 /*
3114 * This is really hard to understand, because after we match
3115 * what we're trying to match, we must make sure the rest of
2c2d71f5 3116 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3117 * to go back UP the parse tree by recursing ever deeper. And
3118 * if it fails, we have to reset our parent's current state
3119 * that we can try again after backing off.
3120 */
3121
c277df42 3122 CHECKPOINT cp, lastcp;
3280af22 3123 CURCUR* cc = PL_regcc;
c277df42
IZ
3124 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3125
4633a7c4 3126 n = cc->cur + 1; /* how many we know we matched */
3280af22 3127 PL_reginput = locinput;
a0d0e21e 3128
c277df42 3129 DEBUG_r(
9041c2e3 3130 PerlIO_printf(Perl_debug_log,
91f3b821 3131 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3132 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3133 (long)n, (long)cc->min,
2797576d 3134 (long)cc->max, PTR2UV(cc))
c277df42 3135 );
4633a7c4 3136
a0d0e21e
LW
3137 /* If degenerate scan matches "", assume scan done. */
3138
579cf2c3 3139 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3140 PL_regcc = cc->oldcc;
2ab05381
IZ
3141 if (PL_regcc)
3142 ln = PL_regcc->cur;
c277df42 3143 DEBUG_r(
c3464db5
DD
3144 PerlIO_printf(Perl_debug_log,
3145 "%*s empty match detected, try continuation...\n",
3280af22 3146 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3147 );
a0d0e21e 3148 if (regmatch(cc->next))
4633a7c4 3149 sayYES;
2ab05381
IZ
3150 if (PL_regcc)
3151 PL_regcc->cur = ln;
3280af22 3152 PL_regcc = cc;
4633a7c4 3153 sayNO;
a0d0e21e
LW
3154 }
3155
3156 /* First just match a string of min scans. */
3157
3158 if (n < cc->min) {
3159 cc->cur = n;
3160 cc->lastloc = locinput;
4633a7c4
LW
3161 if (regmatch(cc->scan))
3162 sayYES;
3163 cc->cur = n - 1;
c277df42 3164 cc->lastloc = lastloc;
4633a7c4 3165 sayNO;
a0d0e21e
LW
3166 }
3167
2c2d71f5
JH
3168 if (scan->flags) {
3169 /* Check whether we already were at this position.
3170 Postpone detection until we know the match is not
3171 *that* much linear. */
3172 if (!PL_reg_maxiter) {
3173 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3174 PL_reg_leftiter = PL_reg_maxiter;
3175 }
3176 if (PL_reg_leftiter-- == 0) {
3177 I32 size = (PL_reg_maxiter + 7)/8;
3178 if (PL_reg_poscache) {
eb160463 3179 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3180 Renew(PL_reg_poscache, size, char);
3181 PL_reg_poscache_size = size;
3182 }
3183 Zero(PL_reg_poscache, size, char);
3184 }
3185 else {
3186 PL_reg_poscache_size = size;
3187 Newz(29, PL_reg_poscache, size, char);
3188 }
3189 DEBUG_r(
3190 PerlIO_printf(Perl_debug_log,
3191 "%sDetected a super-linear match, switching on caching%s...\n",
3192 PL_colors[4], PL_colors[5])
3193 );
3194 }
3195 if (PL_reg_leftiter < 0) {
3196 I32 o = locinput - PL_bostr, b;
3197
3198 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3199 b = o % 8;
3200 o /= 8;
3201 if (PL_reg_poscache[o] & (1<<b)) {
3202 DEBUG_r(
3203 PerlIO_printf(Perl_debug_log,
3204 "%*s already tried at this position...\n",
3205 REPORT_CODE_OFF+PL_regindent*2, "")
3206 );
7821416a 3207 sayNO_SILENT;
2c2d71f5
JH
3208 }
3209 PL_reg_poscache[o] |= (1<<b);
3210 }
3211 }
3212
a0d0e21e
LW
3213 /* Prefer next over scan for minimal matching. */
3214
3215 if (cc->minmod) {
3280af22 3216 PL_regcc = cc->oldcc;
2ab05381
IZ
3217 if (PL_regcc)
3218 ln = PL_regcc->cur;
5f05dabc 3219 cp = regcppush(cc->parenfloor);
02db2b7b 3220 REGCP_SET(lastcp);
5f05dabc 3221 if (regmatch(cc->next)) {
c277df42 3222 regcpblow(cp);
4633a7c4 3223 sayYES; /* All done. */
5f05dabc 3224 }
02db2b7b 3225 REGCP_UNWIND(lastcp);
5f05dabc 3226 regcppop();
2ab05381
IZ
3227 if (PL_regcc)
3228 PL_regcc->cur = ln;
3280af22 3229 PL_regcc = cc;
a0d0e21e 3230
c277df42 3231 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3232 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3233 && !(PL_reg_flags & RF_warned)) {
3234 PL_reg_flags |= RF_warned;
9014280d 3235 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3236 "Complex regular subexpression recursion",
3237 REG_INFTY - 1);
c277df42 3238 }
4633a7c4 3239 sayNO;
c277df42 3240 }
a687059c 3241
c277df42 3242 DEBUG_r(
c3464db5
DD
3243 PerlIO_printf(Perl_debug_log,
3244 "%*s trying longer...\n",
3280af22 3245 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3246 );
a0d0e21e 3247 /* Try scanning more and see if it helps. */
3280af22 3248 PL_reginput = locinput;
a0d0e21e
LW
3249 cc->cur = n;
3250 cc->lastloc = locinput;
5f05dabc 3251 cp = regcppush(cc->parenfloor);
02db2b7b 3252 REGCP_SET(lastcp);
5f05dabc 3253 if (regmatch(cc->scan)) {
c277df42 3254 regcpblow(cp);
4633a7c4 3255 sayYES;
5f05dabc 3256 }
02db2b7b 3257 REGCP_UNWIND(lastcp);
5f05dabc 3258 regcppop();
4633a7c4 3259 cc->cur = n - 1;
c277df42 3260 cc->lastloc = lastloc;
4633a7c4 3261 sayNO;
a0d0e21e
LW
3262 }
3263
3264 /* Prefer scan over next for maximal matching. */
3265
3266 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3267 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3268 cc->cur = n;
3269 cc->lastloc = locinput;
02db2b7b 3270 REGCP_SET(lastcp);
5f05dabc 3271 if (regmatch(cc->scan)) {
c277df42 3272 regcpblow(cp);
4633a7c4 3273 sayYES;
5f05dabc 3274 }
02db2b7b 3275 REGCP_UNWIND(lastcp);
a0d0e21e 3276 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3277 PL_reginput = locinput;
c277df42 3278 DEBUG_r(
c3464db5
DD
3279 PerlIO_printf(Perl_debug_log,
3280 "%*s failed, try continuation...\n",
3280af22 3281 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3282 );
3283 }
9041c2e3 3284 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3285 && !(PL_reg_flags & RF_warned)) {
3280af22 3286 PL_reg_flags |= RF_warned;
9014280d 3287 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3288 "Complex regular subexpression recursion",
3289 REG_INFTY - 1);
a0d0e21e
LW
3290 }
3291
3292 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3293 PL_regcc = cc->oldcc;
2ab05381
IZ
3294 if (PL_regcc)
3295 ln = PL_regcc->cur;
a0d0e21e 3296 if (regmatch(cc->next))
4633a7c4 3297 sayYES;
2ab05381
IZ
3298 if (PL_regcc)
3299 PL_regcc->cur = ln;
3280af22 3300 PL_regcc = cc;
4633a7c4 3301 cc->cur = n - 1;
c277df42 3302 cc->lastloc = lastloc;
4633a7c4 3303 sayNO;
a0d0e21e
LW
3304 }
3305 /* NOT REACHED */
9041c2e3 3306 case BRANCHJ:
c277df42
IZ
3307 next = scan + ARG(scan);
3308 if (next == scan)
3309 next = NULL;
3310 inner = NEXTOPER(NEXTOPER(scan));
3311 goto do_branch;
9041c2e3 3312 case BRANCH:
c277df42
IZ
3313 inner = NEXTOPER(scan);
3314 do_branch:
3315 {
c277df42
IZ
3316 c1 = OP(scan);
3317 if (OP(next) != c1) /* No choice. */
3318 next = inner; /* Avoid recursion. */
a0d0e21e 3319 else {
02db2b7b
IZ
3320 I32 lastparen = *PL_reglastparen;
3321 I32 unwind1;
3322 re_unwind_branch_t *uw;
3323
3324 /* Put unwinding data on stack */
3325 unwind1 = SSNEWt(1,re_unwind_branch_t);
3326 uw = SSPTRt(unwind1,re_unwind_branch_t);
3327 uw->prev = unwind;
3328 unwind = unwind1;
3329 uw->type = ((c1 == BRANCH)
3330 ? RE_UNWIND_BRANCH
3331 : RE_UNWIND_BRANCHJ);
3332 uw->lastparen = lastparen;
3333 uw->next = next;
3334 uw->locinput = locinput;
3335 uw->nextchr = nextchr;
3336#ifdef DEBUGGING
3337 uw->regindent = ++PL_regindent;
3338#endif
c277df42 3339
02db2b7b
IZ
3340 REGCP_SET(uw->lastcp);
3341
3342 /* Now go into the first branch */
3343 next = inner;
a687059c 3344 }
a0d0e21e
LW
3345 }
3346 break;
3347 case MINMOD:
3348 minmod = 1;
3349 break;
c277df42
IZ
3350 case CURLYM:
3351 {
00db4c45 3352 I32 l = 0;
c277df42 3353 CHECKPOINT lastcp;
9041c2e3 3354
c277df42
IZ
3355 /* We suppose that the next guy does not need
3356 backtracking: in particular, it is of constant length,
3357 and has no parenths to influence future backrefs. */
3358 ln = ARG1(scan); /* min to match */
3359 n = ARG2(scan); /* max to match */
c277df42
IZ
3360 paren = scan->flags;
3361 if (paren) {
3280af22
NIS
3362 if (paren > PL_regsize)
3363 PL_regsize = paren;
eb160463 3364 if (paren > (I32)*PL_reglastparen)
3280af22 3365 *PL_reglastparen = paren;
c277df42 3366 }
dc45a647 3367 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3368 if (paren)
3369 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3370 PL_reginput = locinput;
c277df42
IZ
3371 if (minmod) {
3372 minmod = 0;
3373 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3374 sayNO;
f31a99c8
HS
3375 /* if we matched something zero-length we don't need to
3376 backtrack - capturing parens are already defined, so
3377 the caveat in the maximal case doesn't apply
3378
3379 XXXX if ln == 0, we can redo this check first time
3380 through the following loop
3381 */
3382 if (ln && l == 0)
3383 n = ln; /* don't backtrack */
3280af22 3384 locinput = PL_reginput;
cca55fe3 3385 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3386 regnode *text_node = next;
3387
cca55fe3 3388 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3389
cca55fe3 3390 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3391 else {
cca55fe3
JP
3392 if (PL_regkind[(U8)OP(text_node)] == REF) {
3393 I32 n, ln;
3394 n = ARG(text_node); /* which paren pair */
3395 ln = PL_regstartp[n];
3396 /* assume yes if we haven't seen CLOSEn */
3397 if (
eb160463 3398 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3399 ln == -1 ||
3400 ln == PL_regendp[n]
3401 ) {
3402 c1 = c2 = -1000;
3403 goto assume_ok_MM;
3404 }
3405 c1 = *(PL_bostr + ln);
3406 }
3407 else { c1 = (U8)*STRING(text_node); }
af5decee 3408 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3409 c2 = PL_fold[c1];
af5decee 3410 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3411 c2 = PL_fold_locale[c1];
3412 else
3413 c2 = c1;
3414 }
a0ed51b3
LW
3415 }
3416 else
c277df42 3417 c1 = c2 = -1000;
cca55fe3 3418 assume_ok_MM:
02db2b7b 3419 REGCP_SET(lastcp);
5f4b28b2 3420 /* This may be improved if l == 0. */
c277df42
IZ
3421 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3422 /* If it could work, try it. */
3423 if (c1 == -1000 ||
3280af22
NIS
3424 UCHARAT(PL_reginput) == c1 ||
3425 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3426 {
3427 if (paren) {
f31a99c8 3428 if (ln) {
cf93c79d
IZ
3429 PL_regstartp[paren] =
3430 HOPc(PL_reginput, -l) - PL_bostr;
3431 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3432 }
3433 else
cf93c79d 3434 PL_regendp[paren] = -1;
c277df42
IZ
3435 }
3436 if (regmatch(next))
3437 sayYES;
02db2b7b 3438 REGCP_UNWIND(lastcp);
c277df42
IZ
3439 }
3440 /* Couldn't or didn't -- move forward. */
3280af22 3441 PL_reginput = locinput;
c277df42
IZ
3442 if (regrepeat_hard(scan, 1, &l)) {
3443 ln++;
3280af22 3444 locinput = PL_reginput;
c277df42
IZ
3445 }
3446 else
3447 sayNO;
3448 }
a0ed51b3
LW
3449 }
3450 else {
c277df42 3451 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3452 /* if we matched something zero-length we don't need to
3453 backtrack, unless the minimum count is zero and we
3454 are capturing the result - in that case the capture
3455 being defined or not may affect later execution
3456 */
3457 if (n != 0 && l == 0 && !(paren && ln == 0))
3458 ln = n; /* don't backtrack */
3280af22 3459 locinput = PL_reginput;
c277df42 3460 DEBUG_r(
5c0ca799 3461 PerlIO_printf(Perl_debug_log,
faccc32b 3462 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3463 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3464 (IV) n, (IV)l)
c277df42
IZ
3465 );
3466 if (n >= ln) {
cca55fe3 3467 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3468 regnode *text_node = next;
3469
cca55fe3 3470 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3471
cca55fe3 3472 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3473 else {
cca55fe3
JP
3474 if (PL_regkind[(U8)OP(text_node)] == REF) {
3475 I32 n, ln;
3476 n = ARG(text_node); /* which paren pair */
3477 ln = PL_regstartp[n];
3478 /* assume yes if we haven't seen CLOSEn */
3479 if (
eb160463 3480 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3481 ln == -1 ||
3482 ln == PL_regendp[n]
3483 ) {
3484 c1 = c2 = -1000;
3485 goto assume_ok_REG;
3486 }
3487 c1 = *(PL_bostr + ln);
3488 }
3489 else { c1 = (U8)*STRING(text_node); }
3490
af5decee 3491 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3492 c2 = PL_fold[c1];
af5decee 3493 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3494 c2 = PL_fold_locale[c1];
3495 else
3496 c2 = c1;
3497 }
a0ed51b3
LW
3498 }
3499 else
c277df42
IZ
3500 c1 = c2 = -1000;
3501 }
cca55fe3 3502 assume_ok_REG:
02db2b7b 3503 REGCP_SET(lastcp);
c277df42
IZ
3504 while (n >= ln) {
3505 /* If it could work, try it. */
3506 if (c1 == -1000 ||
3280af22
NIS
3507 UCHARAT(PL_reginput) == c1 ||
3508 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3509 {
3510 DEBUG_r(
c3464db5 3511 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3512 "%*s trying tail with n=%"IVdf"...\n",
3513 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3514 );
3515 if (paren) {
3516 if (n) {
cf93c79d
IZ
3517 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3518 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3519 }
a0ed51b3 3520 else
cf93c79d 3521 PL_regendp[paren] = -1;
c277df42 3522 }
a0ed51b3
LW
3523 if (regmatch(next))
3524 sayYES;
02db2b7b 3525 REGCP_UNWIND(lastcp);
a0ed51b3 3526 }
c277df42
IZ
3527 /* Couldn't or didn't -- back up. */
3528 n--;
dfe13c55 3529 locinput = HOPc(locinput, -l);
3280af22 3530 PL_reginput = locinput;
c277df42
IZ
3531 }
3532 }
3533 sayNO;
3534 break;
3535 }
3536 case CURLYN:
3537 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3538 if (paren > PL_regsize)
3539 PL_regsize = paren;
eb160463 3540 if (paren > (I32)*PL_reglastparen)
3280af22 3541 *PL_reglastparen = paren;
c277df42
IZ
3542 ln = ARG1(scan); /* min to match */
3543 n = ARG2(scan); /* max to match */
dc45a647 3544 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3545 goto repeat;
a0d0e21e 3546 case CURLY:
c277df42 3547 paren = 0;
a0d0e21e
LW
3548 ln = ARG1(scan); /* min to match */
3549 n = ARG2(scan); /* max to match */
dc45a647 3550 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3551 goto repeat;
3552 case STAR:
3553 ln = 0;
c277df42 3554 n = REG_INFTY;
a0d0e21e 3555 scan = NEXTOPER(scan);
c277df42 3556 paren = 0;
a0d0e21e
LW
3557 goto repeat;
3558 case PLUS:
c277df42
IZ
3559 ln = 1;
3560 n = REG_INFTY;
3561 scan = NEXTOPER(scan);
3562 paren = 0;
3563 repeat:
a0d0e21e
LW
3564 /*
3565 * Lookahead to avoid useless match attempts
3566 * when we know what character comes next.
3567 */
5f80c4cf
JP
3568
3569 /*
3570 * Used to only do .*x and .*?x, but now it allows
3571 * for )'s, ('s and (?{ ... })'s to be in the way
3572 * of the quantifier and the EXACT-like node. -- japhy
3573 */
3574
cca55fe3 3575 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3576 U8 *s;
3577 regnode *text_node = next;
3578
cca55fe3 3579 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3580
cca55fe3 3581 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3582 else {
cca55fe3
JP
3583 if (PL_regkind[(U8)OP(text_node)] == REF) {
3584 I32 n, ln;
3585 n = ARG(text_node); /* which paren pair */
3586 ln = PL_regstartp[n];
3587 /* assume yes if we haven't seen CLOSEn */
3588 if (
eb160463 3589 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3590 ln == -1 ||
3591 ln == PL_regendp[n]
3592 ) {
3593 c1 = c2 = -1000;
3594 goto assume_ok_easy;
3595 }
9246c65e 3596 s = (U8*)PL_bostr + ln;
cca55fe3
JP
3597 }
3598 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3599
3600 if (!UTF) {
3601 c2 = c1 = *s;
f65d3ee7 3602 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3603 c2 = PL_fold[c1];
f65d3ee7 3604 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3605 c2 = PL_fold_locale[c1];
1aa99e6b 3606 }
5f80c4cf 3607 else { /* UTF */
f65d3ee7 3608 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3609 STRLEN ulen1, ulen2;
e7ae6809
JH
3610 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3611 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3612
3613 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3614 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3615
872c91ae
JH
3616 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3617 ckWARN(WARN_UTF8) ?
3618 0 : UTF8_ALLOW_ANY);
3619 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3620 ckWARN(WARN_UTF8) ?
3621 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3622 }
3623 else {
872c91ae
JH
3624 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3625 ckWARN(WARN_UTF8) ?
3626 0 : UTF8_ALLOW_ANY);
5f80c4cf 3627 }
1aa99e6b
IH
3628 }
3629 }
bbce6d69 3630 }
a0d0e21e 3631 else
bbce6d69 3632 c1 = c2 = -1000;
cca55fe3 3633 assume_ok_easy:
3280af22 3634 PL_reginput = locinput;
a0d0e21e 3635 if (minmod) {
c277df42 3636 CHECKPOINT lastcp;
a0d0e21e
LW
3637 minmod = 0;
3638 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3639 sayNO;
a0ed51b3 3640 locinput = PL_reginput;
02db2b7b 3641 REGCP_SET(lastcp);
0fe9bf95 3642 if (c1 != -1000) {
1aa99e6b 3643 char *e; /* Should not check after this */
0fe9bf95 3644 char *old = locinput;
b2f2f093 3645 int count = 0;
0fe9bf95 3646
1aa99e6b 3647 if (n == REG_INFTY) {
0fe9bf95 3648 e = PL_regeol - 1;
1aa99e6b
IH
3649 if (do_utf8)
3650 while (UTF8_IS_CONTINUATION(*(U8*)e))
3651 e--;
3652 }
3653 else if (do_utf8) {
3654 int m = n - ln;
3655 for (e = locinput;
3656 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3657 e += UTF8SKIP(e);
3658 }
3659 else {
3660 e = locinput + n - ln;
3661 if (e >= PL_regeol)
3662 e = PL_regeol - 1;
3663 }
0fe9bf95
IZ
3664 while (1) {
3665 /* Find place 'next' could work */
1aa99e6b
IH
3666 if (!do_utf8) {
3667 if (c1 == c2) {
a8e8ab15
JH
3668 while (locinput <= e &&
3669 UCHARAT(locinput) != c1)
1aa99e6b
IH
3670 locinput++;
3671 } else {
9041c2e3 3672 while (locinput <= e
a8e8ab15
JH
3673 && UCHARAT(locinput) != c1
3674 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3675 locinput++;
3676 }
3677 count = locinput - old;
3678 }
3679 else {
3680 STRLEN len;
3681 if (c1 == c2) {
872c91ae
JH
3682 /* count initialised to
3683 * utf8_distance(old, locinput) */
b2f2f093 3684 while (locinput <= e &&
872c91ae
JH
3685 utf8n_to_uvchr((U8*)locinput,
3686 UTF8_MAXLEN, &len,
3687 ckWARN(WARN_UTF8) ?
eb160463 3688 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3689 locinput += len;
b2f2f093
JH
3690 count++;
3691 }
1aa99e6b 3692 } else {
872c91ae
JH
3693 /* count initialised to
3694 * utf8_distance(old, locinput) */
b2f2f093 3695 while (locinput <= e) {
872c91ae
JH
3696 UV c = utf8n_to_uvchr((U8*)locinput,
3697 UTF8_MAXLEN, &len,
3698 ckWARN(WARN_UTF8) ?
3699 0 : UTF8_ALLOW_ANY);
eb160463 3700 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3701 break;
b2f2f093
JH
3702 locinput += len;
3703 count++;
1aa99e6b
IH
3704 }
3705 }
0fe9bf95 3706 }
9041c2e3 3707 if (locinput > e)
0fe9bf95
IZ
3708 sayNO;
3709 /* PL_reginput == old now */
3710 if (locinput != old) {
3711 ln = 1; /* Did some */
1aa99e6b 3712 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3713 sayNO;
3714 }
3715 /* PL_reginput == locinput now */
29d1e993 3716 TRYPAREN(paren, ln, locinput);
0fe9bf95 3717 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3718 REGCP_UNWIND(lastcp);
0fe9bf95 3719 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3720 old = locinput;
3721 if (do_utf8)
3722 locinput += UTF8SKIP(locinput);
3723 else
3724 locinput++;
b2f2f093 3725 count = 1;
0fe9bf95
IZ
3726 }
3727 }
3728 else
c277df42 3729 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3730 UV c;
3731 if (c1 != -1000) {
3732 if (do_utf8)
872c91ae
JH
3733 c = utf8n_to_uvchr((U8*)PL_reginput,
3734 UTF8_MAXLEN, 0,
3735 ckWARN(WARN_UTF8) ?
3736 0 : UTF8_ALLOW_ANY);
1aa99e6b 3737 else
9041c2e3 3738 c = UCHARAT(PL_reginput);
2390ecbc 3739 /* If it could work, try it. */
eb160463 3740 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3741 {
ecc99935 3742 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3743 REGCP_UNWIND(lastcp);
3744 }
1aa99e6b 3745 }
a0d0e21e 3746 /* If it could work, try it. */
2390ecbc 3747 else if (c1 == -1000)
bbce6d69 3748 {
ecc99935 3749 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3750 REGCP_UNWIND(lastcp);
bbce6d69 3751 }
c277df42 3752 /* Couldn't or didn't -- move forward. */
a0ed51b3 3753 PL_reginput = locinput;
a0d0e21e
LW
3754 if (regrepeat(scan, 1)) {
3755 ln++;
a0ed51b3
LW
3756 locinput = PL_reginput;
3757 }
3758 else
4633a7c4 3759 sayNO;
a0d0e21e
LW
3760 }
3761 }
3762 else {
c277df42 3763 CHECKPOINT lastcp;
a0d0e21e 3764 n = regrepeat(scan, n);
a0ed51b3 3765 locinput = PL_reginput;
22c35a8c 3766 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3767 ((!PL_multiline && OP(next) != MEOL) ||
3768 OP(next) == SEOL || OP(next) == EOS))
3769 {
a0d0e21e 3770 ln = n; /* why back off? */
1aeab75a
GS
3771 /* ...because $ and \Z can match before *and* after
3772 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3773 We should back off by one in this case. */
3774 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3775 ln--;
3776 }
02db2b7b 3777 REGCP_SET(lastcp);
c277df42 3778 if (paren) {
8fa7f367 3779 UV c = 0;
c277df42 3780 while (n >= ln) {
1aa99e6b
IH
3781 if (c1 != -1000) {
3782 if (do_utf8)
872c91ae
JH
3783 c = utf8n_to_uvchr((U8*)PL_reginput,
3784 UTF8_MAXLEN, 0,
3785 ckWARN(WARN_UTF8) ?
3786 0 : UTF8_ALLOW_ANY);
1aa99e6b 3787 else
9041c2e3 3788 c = UCHARAT(PL_reginput);
1aa99e6b 3789 }
c277df42 3790 /* If it could work, try it. */
eb160463 3791 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3792 {
29d1e993 3793 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3794 REGCP_UNWIND(lastcp);
c277df42
IZ
3795 }
3796 /* Couldn't or didn't -- back up. */
3797 n--;
dfe13c55 3798 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3799 }
a0ed51b3
LW
3800 }
3801 else {
8fa7f367 3802 UV c = 0;
c277df42 3803 while (n >= ln) {
1aa99e6b
IH
3804 if (c1 != -1000) {
3805 if (do_utf8)
872c91ae
JH
3806 c = utf8n_to_uvchr((U8*)PL_reginput,
3807 UTF8_MAXLEN, 0,
3808 ckWARN(WARN_UTF8) ?
3809 0 : UTF8_ALLOW_ANY);
1aa99e6b 3810 else
9041c2e3 3811 c = UCHARAT(PL_reginput);
1aa99e6b 3812 }
c277df42 3813 /* If it could work, try it. */
eb160463 3814 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3815 {
29d1e993 3816 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3817 REGCP_UNWIND(lastcp);
c277df42
IZ
3818 }
3819 /* Couldn't or didn't -- back up. */
3820 n--;
dfe13c55 3821 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3822 }
a0d0e21e
LW
3823 }
3824 }
4633a7c4 3825 sayNO;
c277df42 3826 break;
a0d0e21e 3827 case END:
0f5d15d6
IZ
3828 if (PL_reg_call_cc) {
3829 re_cc_state *cur_call_cc = PL_reg_call_cc;
3830 CURCUR *cctmp = PL_regcc;
3831 regexp *re = PL_reg_re;
3832 CHECKPOINT cp, lastcp;
3833
3834 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3835 REGCP_SET(lastcp);
0f5d15d6
IZ
3836 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3837 the caller. */
3838 PL_reginput = locinput; /* Make position available to
3839 the callcc. */
3840 cache_re(PL_reg_call_cc->re);
3841 PL_regcc = PL_reg_call_cc->cc;
3842 PL_reg_call_cc = PL_reg_call_cc->prev;
3843 if (regmatch(cur_call_cc->node)) {
3844 PL_reg_call_cc = cur_call_cc;
3845 regcpblow(cp);
3846 sayYES;
3847 }
02db2b7b 3848 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3849 regcppop();
3850 PL_reg_call_cc = cur_call_cc;
3851 PL_regcc = cctmp;
3852 PL_reg_re = re;
3853 cache_re(re);
3854
3855 DEBUG_r(
3856 PerlIO_printf(Perl_debug_log,
3857 "%*s continuation failed...\n",
3858 REPORT_CODE_OFF+PL_regindent*2, "")
3859 );
7821416a 3860 sayNO_SILENT;
0f5d15d6 3861 }
7821416a
IZ
3862 if (locinput < PL_regtill) {
3863 DEBUG_r(PerlIO_printf(Perl_debug_log,
3864 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3865 PL_colors[4],
3866 (long)(locinput - PL_reg_starttry),
3867 (long)(PL_regtill - PL_reg_starttry),
3868 PL_colors[5]));
3869 sayNO_FINAL; /* Cannot match: too short. */
3870 }
3871 PL_reginput = locinput; /* put where regtry can find it */
3872 sayYES_FINAL; /* Success! */
7e5428c5 3873 case SUCCEED:
3280af22 3874 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3875 sayYES_LOUD; /* Success! */
c277df42
IZ
3876 case SUSPEND:
3877 n = 1;
9fe1d20c 3878 PL_reginput = locinput;
9041c2e3 3879 goto do_ifmatch;
a0d0e21e 3880 case UNLESSM:
c277df42 3881 n = 0;
a0ed51b3 3882 if (scan->flags) {
efb30f32
HS
3883 s = HOPBACKc(locinput, scan->flags);
3884 if (!s)
3885 goto say_yes;
3886 PL_reginput = s;
a0ed51b3
LW
3887 }
3888 else
3889 PL_reginput = locinput;
c277df42
IZ
3890 goto do_ifmatch;
3891 case IFMATCH:
3892 n = 1;
a0ed51b3 3893 if (scan->flags) {
efb30f32
HS
3894 s = HOPBACKc(locinput, scan->flags);
3895 if (!s)
3896 goto say_no;
3897 PL_reginput = s;
a0ed51b3
LW
3898 }
3899 else
3900 PL_reginput = locinput;
3901
c277df42 3902 do_ifmatch:
c277df42
IZ
3903 inner = NEXTOPER(NEXTOPER(scan));
3904 if (regmatch(inner) != n) {
3905 say_no:
3906 if (logical) {
3907 logical = 0;
3908 sw = 0;
3909 goto do_longjump;
a0ed51b3
LW
3910 }
3911 else
c277df42
IZ
3912 sayNO;
3913 }
3914 say_yes:
3915 if (logical) {
3916 logical = 0;
3917 sw = 1;
3918 }
fe44a5e8 3919 if (OP(scan) == SUSPEND) {
3280af22 3920 locinput = PL_reginput;
565764a8 3921 nextchr = UCHARAT(locinput);
fe44a5e8 3922 }
c277df42
IZ
3923 /* FALL THROUGH. */
3924 case LONGJMP:
3925 do_longjump:
3926 next = scan + ARG(scan);
3927 if (next == scan)
3928 next = NULL;
a0d0e21e
LW
3929 break;
3930 default:
b900a521 3931 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3932 PTR2UV(scan), OP(scan));
cea2e8a9 3933 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3934 }
02db2b7b 3935 reenter:
a0d0e21e
LW
3936 scan = next;
3937 }
a687059c 3938
a0d0e21e
LW
3939 /*
3940 * We get here only if there's trouble -- normally "case END" is
3941 * the terminating point.
3942 */
cea2e8a9 3943 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3944 /*NOTREACHED*/
4633a7c4
LW
3945 sayNO;
3946
7821416a
IZ
3947yes_loud:
3948 DEBUG_r(
3949 PerlIO_printf(Perl_debug_log,
3950 "%*s %scould match...%s\n",
3951 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3952 );
3953 goto yes;
3954yes_final:
3955 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3956 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3957yes:
3958#ifdef DEBUGGING
3280af22 3959 PL_regindent--;
4633a7c4 3960#endif
02db2b7b
IZ
3961
3962#if 0 /* Breaks $^R */
3963 if (unwind)
3964 regcpblow(firstcp);
3965#endif
4633a7c4
LW
3966 return 1;
3967
3968no:
7821416a
IZ
3969 DEBUG_r(
3970 PerlIO_printf(Perl_debug_log,
3971 "%*s %sfailed...%s\n",
3972 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3973 );
3974 goto do_no;
3975no_final:
3976do_no:
02db2b7b
IZ
3977 if (unwind) {
3978 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3979
3980 switch (uw->type) {
3981 case RE_UNWIND_BRANCH:
3982 case RE_UNWIND_BRANCHJ:
3983 {
3984 re_unwind_branch_t *uwb = &(uw->branch);
3985 I32 lastparen = uwb->lastparen;
9041c2e3 3986
02db2b7b
IZ
3987 REGCP_UNWIND(uwb->lastcp);
3988 for (n = *PL_reglastparen; n > lastparen; n--)
3989 PL_regendp[n] = -1;
3990 *PL_reglastparen = n;
3991 scan = next = uwb->next;
9041c2e3
NIS
3992 if ( !scan ||
3993 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3994 ? BRANCH : BRANCHJ) ) { /* Failure */
3995 unwind = uwb->prev;
3996#ifdef DEBUGGING
3997 PL_regindent--;
3998#endif
3999 goto do_no;
4000 }
4001 /* Have more choice yet. Reuse the same uwb. */
4002 /*SUPPRESS 560*/
4003 if ((n = (uwb->type == RE_UNWIND_BRANCH
4004 ? NEXT_OFF(next) : ARG(next))))
4005 next += n;
4006 else
4007 next = NULL; /* XXXX Needn't unwinding in this case... */
4008 uwb->next = next;
4009 next = NEXTOPER(scan);
4010 if (uwb->type == RE_UNWIND_BRANCHJ)
4011 next = NEXTOPER(next);
4012 locinput = uwb->locinput;
4013 nextchr = uwb->nextchr;
4014#ifdef DEBUGGING
4015 PL_regindent = uwb->regindent;
4016#endif
4017
4018 goto reenter;
4019 }
4020 /* NOT REACHED */
4021 default:
4022 Perl_croak(aTHX_ "regexp unwind memory corruption");
4023 }
4024 /* NOT REACHED */
4025 }
4633a7c4 4026#ifdef DEBUGGING
3280af22 4027 PL_regindent--;
4633a7c4 4028#endif
a0d0e21e 4029 return 0;
a687059c
LW
4030}
4031
4032/*
4033 - regrepeat - repeatedly match something simple, report how many
4034 */
4035/*
4036 * [This routine now assumes that it will only match on things of length 1.
4037 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4038 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4039 */
76e3520e 4040STATIC I32
cea2e8a9 4041S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 4042{
a0d0e21e 4043 register char *scan;
a0d0e21e 4044 register I32 c;
3280af22 4045 register char *loceol = PL_regeol;
a0ed51b3 4046 register I32 hardcount = 0;
53c4c00c 4047 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4048
3280af22 4049 scan = PL_reginput;
faf11cac
HS
4050 if (max == REG_INFTY)
4051 max = I32_MAX;
4052 else if (max < loceol - scan)
a0d0e21e 4053 loceol = scan + max;
a0d0e21e 4054 switch (OP(p)) {
22c35a8c 4055 case REG_ANY:
1aa99e6b 4056 if (do_utf8) {
ffc61ed2 4057 loceol = PL_regeol;
1aa99e6b 4058 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4059 scan += UTF8SKIP(scan);
4060 hardcount++;
4061 }
4062 } else {
4063 while (scan < loceol && *scan != '\n')
4064 scan++;
a0ed51b3
LW
4065 }
4066 break;
ffc61ed2 4067 case SANY:
def8e4ea
JH
4068 if (do_utf8) {
4069 loceol = PL_regeol;
a0804c9e 4070 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4071 scan += UTF8SKIP(scan);
4072 hardcount++;
4073 }
4074 }
4075 else
4076 scan = loceol;
a0ed51b3 4077 break;
f33976b4
DB
4078 case CANY:
4079 scan = loceol;
4080 break;
bbce6d69 4081 case EXACT: /* length of string is 1 */
cd439c50 4082 c = (U8)*STRING(p);
bbce6d69 4083 while (scan < loceol && UCHARAT(scan) == c)
4084 scan++;
4085 break;
4086 case EXACTF: /* length of string is 1 */
cd439c50 4087 c = (U8)*STRING(p);
bbce6d69 4088 while (scan < loceol &&
22c35a8c 4089 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4090 scan++;
4091 break;
4092 case EXACTFL: /* length of string is 1 */
3280af22 4093 PL_reg_flags |= RF_tainted;
cd439c50 4094 c = (U8)*STRING(p);
bbce6d69 4095 while (scan < loceol &&
22c35a8c 4096 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4097 scan++;
4098 break;
4099 case ANYOF:
ffc61ed2
JH
4100 if (do_utf8) {
4101 loceol = PL_regeol;
cfc92286
JH
4102 while (hardcount < max && scan < loceol &&
4103 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4104 scan += UTF8SKIP(scan);
4105 hardcount++;
4106 }
4107 } else {
7d3e948e 4108 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4109 scan++;
4110 }
a0d0e21e
LW
4111 break;
4112 case ALNUM:
1aa99e6b 4113 if (do_utf8) {
ffc61ed2 4114 loceol = PL_regeol;
8269fa76 4115 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4116 while (hardcount < max && scan < loceol &&
3568d838 4117 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4118 scan += UTF8SKIP(scan);
4119 hardcount++;
4120 }
4121 } else {
4122 while (scan < loceol && isALNUM(*scan))
4123 scan++;
a0ed51b3
LW
4124 }
4125 break;
bbce6d69 4126 case ALNUML:
3280af22 4127 PL_reg_flags |= RF_tainted;
1aa99e6b 4128 if (do_utf8) {
ffc61ed2 4129 loceol = PL_regeol;
1aa99e6b
IH
4130 while (hardcount < max && scan < loceol &&
4131 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4132 scan += UTF8SKIP(scan);
4133 hardcount++;
4134 }
4135 } else {
4136 while (scan < loceol && isALNUM_LC(*scan))
4137 scan++;
a0ed51b3
LW
4138 }
4139 break;
a0d0e21e 4140 case NALNUM:
1aa99e6b 4141 if (do_utf8) {
ffc61ed2 4142 loceol = PL_regeol;
8269fa76 4143 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4144 while (hardcount < max && scan < loceol &&
3568d838 4145 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4146 scan += UTF8SKIP(scan);
4147 hardcount++;
4148 }
4149 } else {
4150 while (scan < loceol && !isALNUM(*scan))
4151 scan++;
a0ed51b3
LW
4152 }
4153 break;
bbce6d69 4154 case NALNUML:
3280af22 4155 PL_reg_flags |= RF_tainted;
1aa99e6b 4156 if (do_utf8) {
ffc61ed2 4157 loceol = PL_regeol;
1aa99e6b
IH
4158 while (hardcount < max && scan < loceol &&
4159 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4160 scan += UTF8SKIP(scan);
4161 hardcount++;
4162 }
4163 } else {
4164 while (scan < loceol && !isALNUM_LC(*scan))
4165 scan++;
a0ed51b3
LW
4166 }
4167 break;
a0d0e21e 4168 case SPACE:
1aa99e6b 4169 if (do_utf8) {
ffc61ed2 4170 loceol = PL_regeol;
8269fa76 4171 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4172 while (hardcount < max && scan < loceol &&
3568d838
JH
4173 (*scan == ' ' ||
4174 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4175 scan += UTF8SKIP(scan);
4176 hardcount++;
4177 }
4178 } else {
4179 while (scan < loceol && isSPACE(*scan))
4180 scan++;
a0ed51b3
LW
4181 }
4182 break;
bbce6d69 4183 case SPACEL:
3280af22 4184 PL_reg_flags |= RF_tainted;
1aa99e6b 4185 if (do_utf8) {
ffc61ed2 4186 loceol = PL_regeol;
1aa99e6b 4187 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4188 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4189 scan += UTF8SKIP(scan);
4190 hardcount++;
4191 }
4192 } else {
4193 while (scan < loceol && isSPACE_LC(*scan))
4194 scan++;
a0ed51b3
LW
4195 }
4196 break;
a0d0e21e 4197 case NSPACE:
1aa99e6b 4198 if (do_utf8) {
ffc61ed2 4199 loceol = PL_regeol;
8269fa76 4200 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4201 while (hardcount < max && scan < loceol &&
3568d838
JH
4202 !(*scan == ' ' ||
4203 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4204 scan += UTF8SKIP(scan);
4205 hardcount++;
4206 }
4207 } else {
4208 while (scan < loceol && !isSPACE(*scan))
4209 scan++;
4210 break;
a0ed51b3 4211 }
bbce6d69 4212 case NSPACEL:
3280af22 4213 PL_reg_flags |= RF_tainted;
1aa99e6b 4214 if (do_utf8) {
ffc61ed2 4215 loceol = PL_regeol;
1aa99e6b 4216 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4217 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4218 scan += UTF8SKIP(scan);
4219 hardcount++;
4220 }
4221 } else {
4222 while (scan < loceol && !isSPACE_LC(*scan))
4223 scan++;
a0ed51b3
LW
4224 }
4225 break;
a0d0e21e 4226 case DIGIT:
1aa99e6b 4227 if (do_utf8) {
ffc61ed2 4228 loceol = PL_regeol;
8269fa76 4229 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4230 while (hardcount < max && scan < loceol &&
3568d838 4231 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4232 scan += UTF8SKIP(scan);
4233 hardcount++;
4234 }
4235 } else {
4236 while (scan < loceol && isDIGIT(*scan))
4237 scan++;
a0ed51b3
LW
4238 }
4239 break;
a0d0e21e 4240 case NDIGIT:
1aa99e6b 4241 if (do_utf8) {
ffc61ed2 4242 loceol = PL_regeol;
8269fa76 4243 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4244 while (hardcount < max && scan < loceol &&
3568d838 4245 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4246 scan += UTF8SKIP(scan);
4247 hardcount++;
4248 }
4249 } else {
4250 while (scan < loceol && !isDIGIT(*scan))
4251 scan++;
a0ed51b3
LW
4252 }
4253 break;
a0d0e21e
LW
4254 default: /* Called on something of 0 width. */
4255 break; /* So match right here or not at all. */
4256 }
a687059c 4257
a0ed51b3
LW
4258 if (hardcount)
4259 c = hardcount;
4260 else
4261 c = scan - PL_reginput;
3280af22 4262 PL_reginput = scan;
a687059c 4263
9041c2e3 4264 DEBUG_r(
c277df42
IZ
4265 {
4266 SV *prop = sv_newmortal();
4267
4268 regprop(prop, p);
9041c2e3
NIS
4269 PerlIO_printf(Perl_debug_log,
4270 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4271 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4272 });
9041c2e3 4273
a0d0e21e 4274 return(c);
a687059c
LW
4275}
4276
4277/*
c277df42 4278 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4279 *
c277df42
IZ
4280 * The repeater is supposed to have constant length.
4281 */
4282
76e3520e 4283STATIC I32
cea2e8a9 4284S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4285{
b7953727 4286 register char *scan = Nullch;
c277df42 4287 register char *start;
3280af22 4288 register char *loceol = PL_regeol;
a0ed51b3 4289 I32 l = 0;
708e3b05 4290 I32 count = 0, res = 1;
a0ed51b3
LW
4291
4292 if (!max)
4293 return 0;
c277df42 4294
3280af22 4295 start = PL_reginput;
53c4c00c 4296 if (PL_reg_match_utf8) {
708e3b05 4297 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4298 if (!count++) {
4299 l = 0;
4300 while (start < PL_reginput) {
4301 l++;
4302 start += UTF8SKIP(start);
4303 }
4304 *lp = l;
4305 if (l == 0)
4306 return max;
4307 }
4308 if (count == max)
4309 return count;
4310 }
4311 }
4312 else {
708e3b05 4313 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4314 if (!count++) {
4315 *lp = l = PL_reginput - start;
4316 if (max != REG_INFTY && l*max < loceol - scan)
4317 loceol = scan + l*max;
4318 if (l == 0)
4319 return max;
c277df42
IZ
4320 }
4321 }
4322 }
708e3b05 4323 if (!res)
3280af22 4324 PL_reginput = scan;
9041c2e3 4325
a0ed51b3 4326 return count;
c277df42
IZ
4327}
4328
4329/*
ffc61ed2
JH
4330- regclass_swash - prepare the utf8 swash
4331*/
4332
4333SV *
9e55ce06 4334Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4335{
9e55ce06
JH
4336 SV *sw = NULL;
4337 SV *si = NULL;
4338 SV *alt = NULL;
ffc61ed2
JH
4339
4340 if (PL_regdata && PL_regdata->count) {
4341 U32 n = ARG(node);
4342
4343 if (PL_regdata->what[n] == 's') {
4344 SV *rv = (SV*)PL_regdata->data[n];
4345 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4346 SV **ary = AvARRAY(av);
9e55ce06 4347 SV **a, **b;
9041c2e3 4348
9e55ce06
JH
4349 /* See the end of regcomp.c:S_reglass() for
4350 * documentation of these array elements. */
4351
b11f357e
JH
4352 si = *ary;
4353 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4354 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4355
ffc61ed2
JH
4356 if (a)
4357 sw = *a;
4358 else if (si && doinit) {
4359 sw = swash_init("utf8", "", si, 1, 0);
4360 (void)av_store(av, 1, sw);
4361 }
9e55ce06
JH
4362 if (b)
4363 alt = *b;
ffc61ed2
JH
4364 }
4365 }
4366
9e55ce06
JH
4367 if (listsvp)
4368 *listsvp = si;
4369 if (altsvp)
4370 *altsvp = alt;
ffc61ed2
JH
4371
4372 return sw;
4373}
4374
4375/*
ba7b4546 4376 - reginclass - determine if a character falls into a character class
832705d4
JH
4377
4378 The n is the ANYOF regnode, the p is the target string, lenp
4379 is pointer to the maximum length of how far to go in the p
4380 (if the lenp is zero, UTF8SKIP(p) is used),
4381 do_utf8 tells whether the target string is in UTF-8.
4382
bbce6d69 4383 */
4384
76e3520e 4385STATIC bool
ba7b4546 4386S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4387{
ffc61ed2 4388 char flags = ANYOF_FLAGS(n);
bbce6d69 4389 bool match = FALSE;
cc07378b 4390 UV c = *p;
ae9ddab8 4391 STRLEN len = 0;
9e55ce06 4392 STRLEN plen;
1aa99e6b 4393
ae9ddab8
JH
4394 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4395 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4396 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4397
0f0076b4 4398 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4399 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4400 if (lenp)
4401 *lenp = 0;
ffc61ed2 4402 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4403 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4404 match = TRUE;
bbce6d69 4405 }
3568d838 4406 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4407 match = TRUE;
ffc61ed2 4408 if (!match) {
9e55ce06
JH
4409 AV *av;
4410 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4411
4412 if (sw) {
3568d838 4413 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4414 match = TRUE;
4415 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4416 if (!match && lenp && av) {
4417 I32 i;
4418
4419 for (i = 0; i <= av_len(av); i++) {
4420 SV* sv = *av_fetch(av, i, FALSE);
4421 STRLEN len;
4422 char *s = SvPV(sv, len);
4423
061b10df 4424 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4425 *lenp = len;
4426 match = TRUE;
4427 break;
4428 }
4429 }
4430 }
4431 if (!match) {
4a623e43
JH
4432 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4433 STRLEN tmplen;
4434
9e55ce06
JH
4435 to_utf8_fold(p, tmpbuf, &tmplen);
4436 if (swash_fetch(sw, tmpbuf, do_utf8))
4437 match = TRUE;
4438 }
ffc61ed2
JH
4439 }
4440 }
bbce6d69 4441 }
9e55ce06 4442 if (match && lenp && *lenp == 0)
0f0076b4 4443 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4444 }
1aa99e6b 4445 if (!match && c < 256) {
ffc61ed2
JH
4446 if (ANYOF_BITMAP_TEST(n, c))
4447 match = TRUE;
4448 else if (flags & ANYOF_FOLD) {
eb160463 4449 U8 f;
a0ed51b3 4450
ffc61ed2
JH
4451 if (flags & ANYOF_LOCALE) {
4452 PL_reg_flags |= RF_tainted;
4453 f = PL_fold_locale[c];
4454 }
4455 else
4456 f = PL_fold[c];
4457 if (f != c && ANYOF_BITMAP_TEST(n, f))
4458 match = TRUE;
4459 }
4460
4461 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4462 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4463 if (
4464 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4472 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4473 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4474 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4475 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4476 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4477 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4478 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4479 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4480 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4481 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4482 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4483 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4484 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4485 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4486 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4487 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4488 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4489 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4490 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4491 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4492 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4493 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4494 ) /* How's that for a conditional? */
4495 {
4496 match = TRUE;
4497 }
a0ed51b3 4498 }
a0ed51b3
LW
4499 }
4500
a0ed51b3
LW
4501 return (flags & ANYOF_INVERT) ? !match : match;
4502}
161b471a 4503
dfe13c55 4504STATIC U8 *
cea2e8a9 4505S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4506{
1aa99e6b
IH
4507 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4508}
4509
4510STATIC U8 *
4511S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4512{
a0ed51b3 4513 if (off >= 0) {
1aa99e6b 4514 while (off-- && s < lim) {
ffc61ed2 4515 /* XXX could check well-formedness here */
a0ed51b3 4516 s += UTF8SKIP(s);
ffc61ed2 4517 }
a0ed51b3
LW
4518 }
4519 else {
4520 while (off++) {
1aa99e6b 4521 if (s > lim) {
a0ed51b3 4522 s--;
ffc61ed2 4523 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4524 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4525 s--;
ffc61ed2
JH
4526 }
4527 /* XXX could check well-formedness here */
a0ed51b3
LW
4528 }
4529 }
4530 }
4531 return s;
4532}
161b471a 4533
dfe13c55 4534STATIC U8 *
1aa99e6b 4535S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4536{
1aa99e6b
IH
4537 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4538}
4539
4540STATIC U8 *
4541S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4542{
4543 if (off >= 0) {
1aa99e6b 4544 while (off-- && s < lim) {
ffc61ed2 4545 /* XXX could check well-formedness here */
a0ed51b3 4546 s += UTF8SKIP(s);
ffc61ed2 4547 }
a0ed51b3
LW
4548 if (off >= 0)
4549 return 0;
4550 }
4551 else {
4552 while (off++) {
1aa99e6b 4553 if (s > lim) {
a0ed51b3 4554 s--;
ffc61ed2 4555 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4556 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4557 s--;
ffc61ed2
JH
4558 }
4559 /* XXX could check well-formedness here */
a0ed51b3
LW
4560 }
4561 else
4562 break;
4563 }
4564 if (off <= 0)
4565 return 0;
4566 }
4567 return s;
4568}
51371543 4569
51371543 4570static void
acfe0abc 4571restore_pos(pTHX_ void *arg)
51371543 4572{
51371543
GS
4573 if (PL_reg_eval_set) {
4574 if (PL_reg_oldsaved) {
4575 PL_reg_re->subbeg = PL_reg_oldsaved;
4576 PL_reg_re->sublen = PL_reg_oldsavedlen;
ed252734
NC
4577#ifdef PERL_COPY_ON_WRITE
4578 PL_reg_re->saved_copy = PL_nrs;
4579#endif
51371543
GS
4580 RX_MATCH_COPIED_on(PL_reg_re);
4581 }
4582 PL_reg_magic->mg_len = PL_reg_oldpos;
4583 PL_reg_eval_set = 0;
4584 PL_curpm = PL_reg_oldcurpm;
4585 }
4586}
33b8afdf
JH
4587
4588STATIC void
4589S_to_utf8_substr(pTHX_ register regexp *prog)
4590{
4591 SV* sv;
4592 if (prog->float_substr && !prog->float_utf8) {
4593 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4594 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4595 sv_utf8_upgrade(sv);
4596 if (SvTAIL(prog->float_substr))
4597 SvTAIL_on(sv);
4598 if (prog->float_substr == prog->check_substr)
4599 prog->check_utf8 = sv;
4600 }
4601 if (prog->anchored_substr && !prog->anchored_utf8) {
4602 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4603 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4604 sv_utf8_upgrade(sv);
4605 if (SvTAIL(prog->anchored_substr))
4606 SvTAIL_on(sv);
4607 if (prog->anchored_substr == prog->check_substr)
4608 prog->check_utf8 = sv;
4609 }
4610}
4611
4612STATIC void
4613S_to_byte_substr(pTHX_ register regexp *prog)
4614{
4615 SV* sv;
4616 if (prog->float_utf8 && !prog->float_substr) {
4617 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4618 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4619 if (sv_utf8_downgrade(sv, TRUE)) {
4620 if (SvTAIL(prog->float_utf8))
4621 SvTAIL_on(sv);
4622 } else {
4623 SvREFCNT_dec(sv);
4624 prog->float_substr = sv = &PL_sv_undef;
4625 }
4626 if (prog->float_utf8 == prog->check_utf8)
4627 prog->check_substr = sv;
4628 }
4629 if (prog->anchored_utf8 && !prog->anchored_substr) {
4630 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4631 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4632 if (sv_utf8_downgrade(sv, TRUE)) {
4633 if (SvTAIL(prog->anchored_utf8))
4634 SvTAIL_on(sv);
4635 } else {
4636 SvREFCNT_dec(sv);
4637 prog->anchored_substr = sv = &PL_sv_undef;
4638 }
4639 if (prog->anchored_utf8 == prog->check_utf8)
4640 prog->check_substr = sv;
4641 }
4642}