This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #3096] undefing hash with object values
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
4bb101f2
JH
70 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
71 **** 2000, 2001, 2002, 2003, by Larry Wall and others
a687059c 72 ****
9ef589d8
LW
73 **** You may distribute under the terms of either the GNU General Public
74 **** License or the Artistic License, as specified in the README file.
a687059c
LW
75 *
76 * Beware that some of this code is subtly aware of the way operator
77 * precedence is structured in regular expressions. Serious changes in
78 * regular-expression syntax might require a total rethink.
79 */
80#include "EXTERN.h"
864dbfa3 81#define PERL_IN_REGEXEC_C
a687059c 82#include "perl.h"
0f5d15d6 83
a687059c
LW
84#include "regcomp.h"
85
c277df42
IZ
86#define RF_tainted 1 /* tainted information used? */
87#define RF_warned 2 /* warned about big count? */
ce862d02 88#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
89#define RF_utf8 8 /* String contains multibyte chars? */
90
eb160463 91#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
92
93#define RS_init 1 /* eval environment created */
94#define RS_set 2 /* replsv value is set */
c277df42 95
a687059c
LW
96#ifndef STATIC
97#define STATIC static
98#endif
99
ba7b4546 100#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 101
c277df42
IZ
102/*
103 * Forwards.
104 */
105
33b8afdf 106#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 107#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 108
dfe13c55
GS
109#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
110#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
111#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
112#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
113#define HOPc(pos,off) ((char*)HOP(pos,off))
114#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 115
efb30f32 116#define HOPBACK(pos, off) ( \
e54858b0 117 (PL_reg_match_utf8) \
efb30f32
HS
118 ? reghopmaybe((U8*)pos, -off) \
119 : (pos - off >= PL_bostr) \
120 ? (U8*)(pos - off) \
121 : (U8*)NULL \
122)
123#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
124
1aa99e6b
IH
125#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
126#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
127#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
128#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
129#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
130#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
131
977e4b84 132#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 133
5f80c4cf 134/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
135#define JUMPABLE(rn) ( \
136 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
137 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
138 OP(rn) == PLUS || OP(rn) == MINMOD || \
139 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
140)
141
cca55fe3
JP
142#define HAS_TEXT(rn) ( \
143 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
144)
e2d8ce26 145
a84d97b6
HS
146/*
147 Search for mandatory following text node; for lookahead, the text must
148 follow but for lookbehind (rn->flags != 0) we skip to the next step.
149*/
cca55fe3 150#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 151 while (JUMPABLE(rn)) \
a84d97b6 152 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 153 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
154 else if (OP(rn) == PLUS) \
155 rn = NEXTOPER(rn); \
a84d97b6
HS
156 else if (OP(rn) == IFMATCH) \
157 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 158 else rn += NEXT_OFF(rn); \
5f80c4cf 159} STMT_END
74750237 160
acfe0abc 161static void restore_pos(pTHX_ void *arg);
51371543 162
76e3520e 163STATIC CHECKPOINT
cea2e8a9 164S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 165{
3280af22 166 int retval = PL_savestack_ix;
b1ce53c5
JH
167#define REGCP_PAREN_ELEMS 4
168 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
169 int p;
170
e49a9654
IH
171 if (paren_elems_to_push < 0)
172 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
173
a01268b5 174#define REGCP_OTHER_ELEMS 6
b1ce53c5 175 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 176 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 177/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
178 SSPUSHINT(PL_regendp[p]);
179 SSPUSHINT(PL_regstartp[p]);
3280af22 180 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
181 SSPUSHINT(p);
182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
02db2b7b 198# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 199 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 200 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 201
02db2b7b 202# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 203 PerlIO_printf(Perl_debug_log, \
faccc32b 204 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 205 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 206
76e3520e 207STATIC char *
cea2e8a9 208S_regcppop(pTHX)
a0d0e21e 209{
b1ce53c5 210 I32 i;
a0d0e21e
LW
211 U32 paren = 0;
212 char *input;
cf93c79d 213 I32 tmps;
b1ce53c5
JH
214
215 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 216 i = SSPOPINT;
b1ce53c5
JH
217 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
218 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 219 input = (char *) SSPOPPTR;
a01268b5 220 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
221 *PL_reglastparen = SSPOPINT;
222 PL_regsize = SSPOPINT;
b1ce53c5
JH
223
224 /* Now restore the parentheses context. */
41123dfd
JH
225 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
226 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 227 paren = (U32)SSPOPINT;
3280af22 228 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
229 PL_regstartp[paren] = SSPOPINT;
230 tmps = SSPOPINT;
3280af22
NIS
231 if (paren <= *PL_reglastparen)
232 PL_regendp[paren] = tmps;
c277df42 233 DEBUG_r(
c3464db5 234 PerlIO_printf(Perl_debug_log,
b900a521 235 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 236 (UV)paren, (IV)PL_regstartp[paren],
b900a521 237 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 238 (IV)PL_regendp[paren],
3280af22 239 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 240 );
a0d0e21e 241 }
c277df42 242 DEBUG_r(
eb160463 243 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 244 PerlIO_printf(Perl_debug_log,
faccc32b
JH
245 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
246 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
247 }
248 );
daf18116 249#if 1
dafc8851
JH
250 /* It would seem that the similar code in regtry()
251 * already takes care of this, and in fact it is in
252 * a better location to since this code can #if 0-ed out
253 * but the code in regtry() is needed or otherwise tests
254 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
255 * (as of patchlevel 7877) will fail. Then again,
256 * this code seems to be necessary or otherwise
257 * building DynaLoader will fail:
258 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
259 * --jhi */
eb160463
GS
260 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
261 if ((I32)paren > PL_regsize)
cf93c79d
IZ
262 PL_regstartp[paren] = -1;
263 PL_regendp[paren] = -1;
a0d0e21e 264 }
dafc8851 265#endif
a0d0e21e
LW
266 return input;
267}
268
0f5d15d6 269STATIC char *
cea2e8a9 270S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
271{
272 I32 tmp = PL_savestack_ix;
273
274 PL_savestack_ix = ss;
275 regcppop();
276 PL_savestack_ix = tmp;
942e002e 277 return Nullch;
0f5d15d6
IZ
278}
279
280typedef struct re_cc_state
281{
282 I32 ss;
283 regnode *node;
284 struct re_cc_state *prev;
285 CURCUR *cc;
286 regexp *re;
287} re_cc_state;
288
02db2b7b 289#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 290
29d1e993
HS
291#define TRYPAREN(paren, n, input) { \
292 if (paren) { \
293 if (n) { \
294 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
295 PL_regendp[paren] = input - PL_bostr; \
296 } \
297 else \
298 PL_regendp[paren] = -1; \
299 } \
300 if (regmatch(next)) \
301 sayYES; \
302 if (paren && n) \
303 PL_regendp[paren] = -1; \
304}
305
306
a687059c 307/*
e50aee73 308 * pregexec and friends
a687059c
LW
309 */
310
311/*
c277df42 312 - pregexec - match a regexp against a string
a687059c 313 */
c277df42 314I32
864dbfa3 315Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 316 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
317/* strend: pointer to null at end of string */
318/* strbeg: real beginning of string */
319/* minend: end of match must be >=minend after stringarg. */
320/* nosave: For optimizations. */
321{
322 return
9041c2e3 323 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
324 nosave ? 0 : REXEC_COPY_STR);
325}
0f5d15d6
IZ
326
327STATIC void
cea2e8a9 328S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
329{
330 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
331#ifdef DEBUGGING
332 PL_regprogram = prog->program;
333#endif
334 PL_regnpar = prog->nparens;
9041c2e3
NIS
335 PL_regdata = prog->data;
336 PL_reg_re = prog;
0f5d15d6 337}
22e551b9 338
9041c2e3 339/*
cad2e5aa
JH
340 * Need to implement the following flags for reg_anch:
341 *
342 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
343 * USE_INTUIT_ML
344 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
345 * INTUIT_AUTORITATIVE_ML
346 * INTUIT_ONCE_NOML - Intuit can match in one location only.
347 * INTUIT_ONCE_ML
348 *
349 * Another flag for this function: SECOND_TIME (so that float substrs
350 * with giant delta may be not rechecked).
351 */
352
353/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
354
2c2d71f5 355/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
356 Otherwise, only SvCUR(sv) is used to get strbeg. */
357
358/* XXXX We assume that strpos is strbeg unless sv. */
359
6eb5f6b9
JH
360/* XXXX Some places assume that there is a fixed substring.
361 An update may be needed if optimizer marks as "INTUITable"
362 RExen without fixed substrings. Similarly, it is assumed that
363 lengths of all the strings are no more than minlen, thus they
364 cannot come from lookahead.
365 (Or minlen should take into account lookahead.) */
366
2c2d71f5
JH
367/* A failure to find a constant substring means that there is no need to make
368 an expensive call to REx engine, thus we celebrate a failure. Similarly,
369 finding a substring too deep into the string means that less calls to
30944b6d
IZ
370 regtry() should be needed.
371
372 REx compiler's optimizer found 4 possible hints:
373 a) Anchored substring;
374 b) Fixed substring;
375 c) Whether we are anchored (beginning-of-line or \G);
376 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 377 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
378 string which does not contradict any of them.
379 */
2c2d71f5 380
6eb5f6b9
JH
381/* Most of decisions we do here should have been done at compile time.
382 The nodes of the REx which we used for the search should have been
383 deleted from the finite automaton. */
384
cad2e5aa
JH
385char *
386Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
387 char *strend, U32 flags, re_scream_pos_data *data)
388{
b7953727 389 register I32 start_shift = 0;
cad2e5aa 390 /* Should be nonnegative! */
b7953727 391 register I32 end_shift = 0;
2c2d71f5
JH
392 register char *s;
393 register SV *check;
a1933d95 394 char *strbeg;
cad2e5aa 395 char *t;
33b8afdf 396 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 397 I32 ml_anch;
6eb5f6b9 398 register char *other_last = Nullch; /* other substr checked before this */
b7953727 399 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
400#ifdef DEBUGGING
401 char *i_strpos = strpos;
ce333219 402 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 403#endif
a30b2f1f 404 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 405
b8d68ded
JH
406 if (prog->reganch & ROPT_UTF8) {
407 DEBUG_r(PerlIO_printf(Perl_debug_log,
408 "UTF-8 regex...\n"));
409 PL_reg_flags |= RF_utf8;
410 }
411
2a782b5b 412 DEBUG_r({
b8d68ded 413 char *s = PL_reg_match_utf8 ?
c728cb41
JH
414 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
415 strpos;
b8d68ded
JH
416 int len = PL_reg_match_utf8 ?
417 strlen(s) : strend - strpos;
2a782b5b
JH
418 if (!PL_colorset)
419 reginitcolors();
b8d68ded
JH
420 if (PL_reg_match_utf8)
421 DEBUG_r(PerlIO_printf(Perl_debug_log,
422 "UTF-8 target...\n"));
2a782b5b
JH
423 PerlIO_printf(Perl_debug_log,
424 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
425 PL_colors[4],PL_colors[5],PL_colors[0],
426 prog->precomp,
427 PL_colors[1],
428 (strlen(prog->precomp) > 60 ? "..." : ""),
429 PL_colors[0],
430 (int)(len > 60 ? 60 : len),
431 s, PL_colors[1],
432 (len > 60 ? "..." : "")
433 );
434 });
cad2e5aa 435
c344f387
JH
436 /* CHR_DIST() would be more correct here but it makes things slow. */
437 if (prog->minlen > strend - strpos) {
a72c7584
JH
438 DEBUG_r(PerlIO_printf(Perl_debug_log,
439 "String too short... [re_intuit_start]\n"));
cad2e5aa 440 goto fail;
2c2d71f5 441 }
a1933d95 442 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 443 PL_regeol = strend;
33b8afdf
JH
444 if (do_utf8) {
445 if (!prog->check_utf8 && prog->check_substr)
446 to_utf8_substr(prog);
447 check = prog->check_utf8;
448 } else {
449 if (!prog->check_substr && prog->check_utf8)
450 to_byte_substr(prog);
451 check = prog->check_substr;
452 }
453 if (check == &PL_sv_undef) {
454 DEBUG_r(PerlIO_printf(Perl_debug_log,
455 "Non-utf string cannot match utf check string\n"));
456 goto fail;
457 }
2c2d71f5 458 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
459 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
460 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 461 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 462
7e25d62c
JH
463 if (!ml_anch) {
464 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
465 | ROPT_IMPLICIT)) /* not a real BOL */
466 /* SvCUR is not set on references: SvRV and SvPVX overlap */
467 && sv && !SvROK(sv)
468 && (strpos != strbeg)) {
469 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
470 goto fail;
471 }
472 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 473 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 474 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
475 I32 slen;
476
1aa99e6b 477 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
478 if (SvTAIL(check)) {
479 slen = SvCUR(check); /* >= 1 */
cad2e5aa 480
9041c2e3 481 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
482 || (strend - s == slen && strend[-1] != '\n')) {
483 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
484 goto fail_finish;
cad2e5aa
JH
485 }
486 /* Now should match s[0..slen-2] */
487 slen--;
653099ff 488 if (slen && (*SvPVX(check) != *s
cad2e5aa 489 || (slen > 1
653099ff 490 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
491 report_neq:
492 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
493 goto fail_finish;
494 }
cad2e5aa 495 }
653099ff
GS
496 else if (*SvPVX(check) != *s
497 || ((slen = SvCUR(check)) > 1
498 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
499 goto report_neq;
500 goto success_at_start;
7e25d62c 501 }
cad2e5aa 502 }
2c2d71f5 503 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 504 s = strpos;
2c2d71f5 505 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 506 end_shift = prog->minlen - start_shift -
653099ff 507 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 508 if (!ml_anch) {
653099ff
GS
509 I32 end = prog->check_offset_max + CHR_SVLEN(check)
510 - (SvTAIL(check) != 0);
1aa99e6b 511 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
512
513 if (end_shift < eshift)
514 end_shift = eshift;
515 }
cad2e5aa 516 }
2c2d71f5 517 else { /* Can match at random position */
cad2e5aa
JH
518 ml_anch = 0;
519 s = strpos;
2c2d71f5
JH
520 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
521 /* Should be nonnegative! */
522 end_shift = prog->minlen - start_shift -
653099ff 523 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
524 }
525
2c2d71f5 526#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 527 if (end_shift < 0)
6bbae5e6 528 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
529#endif
530
2c2d71f5
JH
531 restart:
532 /* Find a possible match in the region s..strend by looking for
533 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 534 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
535 I32 p = -1; /* Internal iterator of scream. */
536 I32 *pp = data ? data->scream_pos : &p;
537
2c2d71f5
JH
538 if (PL_screamfirst[BmRARE(check)] >= 0
539 || ( BmRARE(check) == '\n'
540 && (BmPREVIOUS(check) == SvCUR(check) - 1)
541 && SvTAIL(check) ))
9041c2e3 542 s = screaminstr(sv, check,
2c2d71f5 543 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 544 else
2c2d71f5 545 goto fail_finish;
4addbd3b
HS
546 /* we may be pointing at the wrong string */
547 if (s && RX_MATCH_COPIED(prog))
7ef91622 548 s = strbeg + (s - SvPVX(sv));
cad2e5aa
JH
549 if (data)
550 *data->scream_olds = s;
551 }
f33976b4 552 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
553 s = fbm_instr((U8*)(s + start_shift),
554 (U8*)(strend - end_shift),
555 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 556 else
1aa99e6b
IH
557 s = fbm_instr(HOP3(s, start_shift, strend),
558 HOP3(strend, -end_shift, strbeg),
2c2d71f5 559 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
560
561 /* Update the count-of-usability, remove useless subpatterns,
562 unshift s. */
2c2d71f5
JH
563
564 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
565 (s ? "Found" : "Did not find"),
33b8afdf 566 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 567 PL_colors[0],
7b0972df
JH
568 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
569 SvPVX(check),
2c2d71f5
JH
570 PL_colors[1], (SvTAIL(check) ? "$" : ""),
571 (s ? " at offset " : "...\n") ) );
572
573 if (!s)
574 goto fail_finish;
575
6eb5f6b9
JH
576 check_at = s;
577
2c2d71f5 578 /* Finish the diagnostic message */
30944b6d 579 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
580
581 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
582 Start with the other substr.
583 XXXX no SCREAM optimization yet - and a very coarse implementation
584 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
585 *always* match. Probably should be marked during compile...
586 Probably it is right to do no SCREAM here...
587 */
588
33b8afdf 589 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 590 /* Take into account the "other" substring. */
2c2d71f5
JH
591 /* XXXX May be hopelessly wrong for UTF... */
592 if (!other_last)
6eb5f6b9 593 other_last = strpos;
33b8afdf 594 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
595 do_other_anchored:
596 {
1aa99e6b 597 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 598 char *s1 = s;
33b8afdf 599 SV* must;
2c2d71f5 600
2c2d71f5
JH
601 t = s - prog->check_offset_max;
602 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 603 && (!do_utf8
1aa99e6b 604 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 605 && t > strpos)))
30944b6d 606 /* EMPTY */;
2c2d71f5
JH
607 else
608 t = strpos;
1aa99e6b 609 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
610 if (t < other_last) /* These positions already checked */
611 t = other_last;
1aa99e6b 612 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
613 if (last < last1)
614 last1 = last;
615 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
616 /* On end-of-str: see comment below. */
33b8afdf
JH
617 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
618 if (must == &PL_sv_undef) {
619 s = (char*)NULL;
620 DEBUG_r(must = prog->anchored_utf8); /* for debug */
621 }
622 else
623 s = fbm_instr(
624 (unsigned char*)t,
625 HOP3(HOP3(last1, prog->anchored_offset, strend)
626 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
627 must,
628 PL_multiline ? FBMrf_MULTILINE : 0
629 );
1aa99e6b
IH
630 DEBUG_r(PerlIO_printf(Perl_debug_log,
631 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
632 (s ? "Found" : "Contradicts"),
633 PL_colors[0],
33b8afdf
JH
634 (int)(SvCUR(must)
635 - (SvTAIL(must)!=0)),
636 SvPVX(must),
637 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
638 if (!s) {
639 if (last1 >= last2) {
640 DEBUG_r(PerlIO_printf(Perl_debug_log,
641 ", giving up...\n"));
642 goto fail_finish;
643 }
644 DEBUG_r(PerlIO_printf(Perl_debug_log,
645 ", trying floating at offset %ld...\n",
1aa99e6b
IH
646 (long)(HOP3c(s1, 1, strend) - i_strpos)));
647 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
648 s = HOP3c(last, 1, strend);
2c2d71f5
JH
649 goto restart;
650 }
651 else {
652 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 653 (long)(s - i_strpos)));
1aa99e6b
IH
654 t = HOP3c(s, -prog->anchored_offset, strbeg);
655 other_last = HOP3c(s, 1, strend);
30944b6d 656 s = s1;
2c2d71f5
JH
657 if (t == strpos)
658 goto try_at_start;
2c2d71f5
JH
659 goto try_at_offset;
660 }
30944b6d 661 }
2c2d71f5
JH
662 }
663 else { /* Take into account the floating substring. */
33b8afdf
JH
664 char *last, *last1;
665 char *s1 = s;
666 SV* must;
667
668 t = HOP3c(s, -start_shift, strbeg);
669 last1 = last =
670 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
671 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
672 last = HOP3c(t, prog->float_max_offset, strend);
673 s = HOP3c(t, prog->float_min_offset, strend);
674 if (s < other_last)
675 s = other_last;
2c2d71f5 676 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
677 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
678 /* fbm_instr() takes into account exact value of end-of-str
679 if the check is SvTAIL(ed). Since false positives are OK,
680 and end-of-str is not later than strend we are OK. */
681 if (must == &PL_sv_undef) {
682 s = (char*)NULL;
683 DEBUG_r(must = prog->float_utf8); /* for debug message */
684 }
685 else
2c2d71f5 686 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
687 (unsigned char*)last + SvCUR(must)
688 - (SvTAIL(must)!=0),
689 must, PL_multiline ? FBMrf_MULTILINE : 0);
690 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
691 (s ? "Found" : "Contradicts"),
692 PL_colors[0],
693 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
694 SvPVX(must),
695 PL_colors[1], (SvTAIL(must) ? "$" : "")));
696 if (!s) {
697 if (last1 == last) {
2c2d71f5 698 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
699 ", giving up...\n"));
700 goto fail_finish;
2c2d71f5 701 }
33b8afdf
JH
702 DEBUG_r(PerlIO_printf(Perl_debug_log,
703 ", trying anchored starting at offset %ld...\n",
704 (long)(s1 + 1 - i_strpos)));
705 other_last = last;
706 s = HOP3c(t, 1, strend);
707 goto restart;
708 }
709 else {
710 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
711 (long)(s - i_strpos)));
712 other_last = s; /* Fix this later. --Hugo */
713 s = s1;
714 if (t == strpos)
715 goto try_at_start;
716 goto try_at_offset;
717 }
2c2d71f5 718 }
cad2e5aa 719 }
2c2d71f5
JH
720
721 t = s - prog->check_offset_max;
2c2d71f5 722 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 723 && (!do_utf8
1aa99e6b
IH
724 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
725 && t > strpos))) {
2c2d71f5
JH
726 /* Fixed substring is found far enough so that the match
727 cannot start at strpos. */
728 try_at_offset:
cad2e5aa 729 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
730 /* Eventually fbm_*() should handle this, but often
731 anchored_offset is not 0, so this check will not be wasted. */
732 /* XXXX In the code below we prefer to look for "^" even in
733 presence of anchored substrings. And we search even
734 beyond the found float position. These pessimizations
735 are historical artefacts only. */
736 find_anchor:
2c2d71f5 737 while (t < strend - prog->minlen) {
cad2e5aa 738 if (*t == '\n') {
4ee3650e 739 if (t < check_at - prog->check_offset_min) {
33b8afdf 740 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
741 /* Since we moved from the found position,
742 we definitely contradict the found anchored
30944b6d
IZ
743 substr. Due to the above check we do not
744 contradict "check" substr.
745 Thus we can arrive here only if check substr
746 is float. Redo checking for "other"=="fixed".
747 */
9041c2e3 748 strpos = t + 1;
30944b6d
IZ
749 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
750 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
751 goto do_other_anchored;
752 }
4ee3650e
GS
753 /* We don't contradict the found floating substring. */
754 /* XXXX Why not check for STCLASS? */
cad2e5aa 755 s = t + 1;
2c2d71f5 756 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 757 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
758 goto set_useful;
759 }
4ee3650e
GS
760 /* Position contradicts check-string */
761 /* XXXX probably better to look for check-string
762 than for "\n", so one should lower the limit for t? */
763 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 764 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 765 other_last = strpos = s = t + 1;
cad2e5aa
JH
766 goto restart;
767 }
768 t++;
769 }
2c2d71f5
JH
770 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
771 PL_colors[0],PL_colors[1]));
772 goto fail_finish;
cad2e5aa 773 }
f5952150
GS
774 else {
775 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
776 PL_colors[0],PL_colors[1]));
777 }
cad2e5aa
JH
778 s = t;
779 set_useful:
33b8afdf 780 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
781 }
782 else {
f5952150 783 /* The found string does not prohibit matching at strpos,
2c2d71f5 784 - no optimization of calling REx engine can be performed,
f5952150
GS
785 unless it was an MBOL and we are not after MBOL,
786 or a future STCLASS check will fail this. */
2c2d71f5
JH
787 try_at_start:
788 /* Even in this situation we may use MBOL flag if strpos is offset
789 wrt the start of the string. */
05b4157f 790 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 791 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
792 /* May be due to an implicit anchor of m{.*foo} */
793 && !(prog->reganch & ROPT_IMPLICIT))
794 {
cad2e5aa
JH
795 t = strpos;
796 goto find_anchor;
797 }
30944b6d 798 DEBUG_r( if (ml_anch)
f5952150
GS
799 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
800 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 801 );
2c2d71f5 802 success_at_start:
30944b6d 803 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
804 && (do_utf8 ? (
805 prog->check_utf8 /* Could be deleted already */
806 && --BmUSEFUL(prog->check_utf8) < 0
807 && (prog->check_utf8 == prog->float_utf8)
808 ) : (
809 prog->check_substr /* Could be deleted already */
810 && --BmUSEFUL(prog->check_substr) < 0
811 && (prog->check_substr == prog->float_substr)
812 )))
66e933ab 813 {
cad2e5aa 814 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 815 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
816 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
817 if (do_utf8 ? prog->check_substr : prog->check_utf8)
818 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
819 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
820 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 821 check = Nullsv; /* abort */
cad2e5aa 822 s = strpos;
3cf5c195
IZ
823 /* XXXX This is a remnant of the old implementation. It
824 looks wasteful, since now INTUIT can use many
6eb5f6b9 825 other heuristics. */
cad2e5aa
JH
826 prog->reganch &= ~RE_USE_INTUIT;
827 }
828 else
829 s = strpos;
830 }
831
6eb5f6b9
JH
832 /* Last resort... */
833 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
834 if (prog->regstclass) {
835 /* minlen == 0 is possible if regstclass is \b or \B,
836 and the fixed substr is ''$.
837 Since minlen is already taken into account, s+1 is before strend;
838 accidentally, minlen >= 1 guaranties no false positives at s + 1
839 even for \b or \B. But (minlen? 1 : 0) below assumes that
840 regstclass does not come from lookahead... */
841 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
842 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 843 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 844 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 845 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 846 : 1);
33b8afdf 847 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 848 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 849 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
850 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
851 cl_l, strend)
852 : strend);
a1933d95 853 char *startpos = strbeg;
6eb5f6b9
JH
854
855 t = s;
9926ca43 856 cache_re(prog);
f33976b4 857 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
858 if (!s) {
859#ifdef DEBUGGING
b7953727 860 char *what = 0;
6eb5f6b9
JH
861#endif
862 if (endpos == strend) {
863 DEBUG_r( PerlIO_printf(Perl_debug_log,
864 "Could not match STCLASS...\n") );
865 goto fail;
866 }
66e933ab
GS
867 DEBUG_r( PerlIO_printf(Perl_debug_log,
868 "This position contradicts STCLASS...\n") );
653099ff
GS
869 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
870 goto fail;
6eb5f6b9 871 /* Contradict one of substrings */
33b8afdf
JH
872 if (prog->anchored_substr || prog->anchored_utf8) {
873 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
874 DEBUG_r( what = "anchored" );
875 hop_and_restart:
1aa99e6b 876 s = HOP3c(t, 1, strend);
66e933ab
GS
877 if (s + start_shift + end_shift > strend) {
878 /* XXXX Should be taken into account earlier? */
879 DEBUG_r( PerlIO_printf(Perl_debug_log,
880 "Could not match STCLASS...\n") );
881 goto fail;
882 }
5e39e1e5
HS
883 if (!check)
884 goto giveup;
6eb5f6b9 885 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 886 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
887 what, (long)(s + start_shift - i_strpos)) );
888 goto restart;
889 }
66e933ab 890 /* Have both, check_string is floating */
6eb5f6b9
JH
891 if (t + start_shift >= check_at) /* Contradicts floating=check */
892 goto retry_floating_check;
893 /* Recheck anchored substring, but not floating... */
9041c2e3 894 s = check_at;
5e39e1e5
HS
895 if (!check)
896 goto giveup;
6eb5f6b9 897 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 898 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
899 (long)(other_last - i_strpos)) );
900 goto do_other_anchored;
901 }
60e71179
GS
902 /* Another way we could have checked stclass at the
903 current position only: */
904 if (ml_anch) {
905 s = t = t + 1;
5e39e1e5
HS
906 if (!check)
907 goto giveup;
60e71179 908 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
909 "Looking for /%s^%s/m starting at offset %ld...\n",
910 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 911 goto try_at_offset;
66e933ab 912 }
33b8afdf 913 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 914 goto fail;
6eb5f6b9
JH
915 /* Check is floating subtring. */
916 retry_floating_check:
917 t = check_at - start_shift;
918 DEBUG_r( what = "floating" );
919 goto hop_and_restart;
920 }
b7953727
JH
921 if (t != s) {
922 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 923 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
924 (long)(t - i_strpos), (long)(s - i_strpos))
925 );
926 }
927 else {
928 DEBUG_r(PerlIO_printf(Perl_debug_log,
929 "Does not contradict STCLASS...\n");
930 );
931 }
6eb5f6b9 932 }
5e39e1e5
HS
933 giveup:
934 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
935 PL_colors[4], (check ? "Guessed" : "Giving up"),
936 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 937 return s;
2c2d71f5
JH
938
939 fail_finish: /* Substring not found */
33b8afdf
JH
940 if (prog->check_substr || prog->check_utf8) /* could be removed already */
941 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 942 fail:
2c2d71f5 943 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
944 PL_colors[4],PL_colors[5]));
945 return Nullch;
946}
9661b544 947
6eb5f6b9 948/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
949STATIC char *
950S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 951{
6eb5f6b9
JH
952 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
953 char *m;
d8093b23
G
954 STRLEN ln;
955 unsigned int c1;
956 unsigned int c2;
6eb5f6b9
JH
957 char *e;
958 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 959 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 960
6eb5f6b9
JH
961 /* We know what class it must start with. */
962 switch (OP(c)) {
6eb5f6b9 963 case ANYOF:
388cc4de
HS
964 if (do_utf8) {
965 while (s < strend) {
966 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
967 !UTF8_IS_INVARIANT((U8)s[0]) ?
968 reginclass(c, (U8*)s, 0, do_utf8) :
969 REGINCLASS(c, (U8*)s)) {
970 if (tmp && (norun || regtry(prog, s)))
971 goto got_it;
972 else
973 tmp = doevery;
974 }
975 else
976 tmp = 1;
977 s += UTF8SKIP(s);
978 }
979 }
980 else {
981 while (s < strend) {
982 STRLEN skip = 1;
983
984 if (REGINCLASS(c, (U8*)s) ||
985 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
986 /* The assignment of 2 is intentional:
987 * for the folded sharp s, the skip is 2. */
988 (skip = SHARP_S_SKIP))) {
989 if (tmp && (norun || regtry(prog, s)))
990 goto got_it;
991 else
992 tmp = doevery;
993 }
994 else
995 tmp = 1;
996 s += skip;
997 }
a0d0e21e 998 }
6eb5f6b9 999 break;
f33976b4
DB
1000 case CANY:
1001 while (s < strend) {
1002 if (tmp && (norun || regtry(prog, s)))
1003 goto got_it;
1004 else
1005 tmp = doevery;
1006 s++;
1007 }
1008 break;
6eb5f6b9
JH
1009 case EXACTF:
1010 m = STRING(c);
1011 ln = STR_LEN(c);
1aa99e6b 1012 if (UTF) {
a2a2844f 1013 STRLEN ulen1, ulen2;
e7ae6809
JH
1014 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1015 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
1016
1017 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1018 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1019
872c91ae
JH
1020 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1021 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1022 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1023 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1024 }
1025 else {
1026 c1 = *(U8*)m;
1027 c2 = PL_fold[c1];
1028 }
6eb5f6b9
JH
1029 goto do_exactf;
1030 case EXACTFL:
1031 m = STRING(c);
1032 ln = STR_LEN(c);
d8093b23 1033 c1 = *(U8*)m;
6eb5f6b9
JH
1034 c2 = PL_fold_locale[c1];
1035 do_exactf:
6795a346 1036 e = HOP3c(strend, -(I32)ln, s);
b3c9acc1 1037
6eb5f6b9
JH
1038 if (norun && e < s)
1039 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1040
60a8b682
JH
1041 /* The idea in the EXACTF* cases is to first find the
1042 * first character of the EXACTF* node and then, if
1043 * necessary, case-insensitively compare the full
1044 * text of the node. The c1 and c2 are the first
1045 * characters (though in Unicode it gets a bit
1046 * more complicated because there are more cases
7f16dd3d
JH
1047 * than just upper and lower: one needs to use
1048 * the so-called folding case for case-insensitive
1049 * matching (called "loose matching" in Unicode).
1050 * ibcmp_utf8() will do just that. */
60a8b682 1051
1aa99e6b 1052 if (do_utf8) {
575cac57
JH
1053 UV c, f;
1054 U8 tmpbuf [UTF8_MAXLEN+1];
1055 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1056 STRLEN len, foldlen;
d7f013c8 1057
09091399 1058 if (c1 == c2) {
1aa99e6b 1059 while (s <= e) {
872c91ae
JH
1060 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1061 ckWARN(WARN_UTF8) ?
1062 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1063 if ( c == c1
1064 && (ln == len ||
66423254 1065 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1066 m, (char **)0, ln, (bool)UTF))
55da9344 1067 && (norun || regtry(prog, s)) )
1aa99e6b 1068 goto got_it;
80aecb99
JH
1069 else {
1070 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1071 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1072 if ( f != c
1073 && (f == c1 || f == c2)
1074 && (ln == foldlen ||
66423254
JH
1075 !ibcmp_utf8((char *) foldbuf,
1076 (char **)0, foldlen, do_utf8,
d07ddd77 1077 m,
eb160463 1078 (char **)0, ln, (bool)UTF))
80aecb99
JH
1079 && (norun || regtry(prog, s)) )
1080 goto got_it;
1081 }
1aa99e6b
IH
1082 s += len;
1083 }
09091399
JH
1084 }
1085 else {
1aa99e6b 1086 while (s <= e) {
872c91ae
JH
1087 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1088 ckWARN(WARN_UTF8) ?
1089 0 : UTF8_ALLOW_ANY);
80aecb99 1090
60a8b682 1091 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1092 * Note that not all the possible combinations
1093 * are handled here: some of them are handled
1094 * by the standard folding rules, and some of
1095 * them (the character class or ANYOF cases)
1096 * are handled during compiletime in
1097 * regexec.c:S_regclass(). */
880bd946
JH
1098 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1099 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1100 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1101
1102 if ( (c == c1 || c == c2)
1103 && (ln == len ||
66423254 1104 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1105 m, (char **)0, ln, (bool)UTF))
55da9344 1106 && (norun || regtry(prog, s)) )
1aa99e6b 1107 goto got_it;
80aecb99
JH
1108 else {
1109 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1110 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1111 if ( f != c
1112 && (f == c1 || f == c2)
1113 && (ln == foldlen ||
a6872d42 1114 !ibcmp_utf8((char *) foldbuf,
66423254 1115 (char **)0, foldlen, do_utf8,
d07ddd77 1116 m,
eb160463 1117 (char **)0, ln, (bool)UTF))
80aecb99
JH
1118 && (norun || regtry(prog, s)) )
1119 goto got_it;
1120 }
1aa99e6b
IH
1121 s += len;
1122 }
09091399 1123 }
1aa99e6b
IH
1124 }
1125 else {
1126 if (c1 == c2)
1127 while (s <= e) {
1128 if ( *(U8*)s == c1
1129 && (ln == 1 || !(OP(c) == EXACTF
1130 ? ibcmp(s, m, ln)
1131 : ibcmp_locale(s, m, ln)))
1132 && (norun || regtry(prog, s)) )
1133 goto got_it;
1134 s++;
1135 }
1136 else
1137 while (s <= e) {
1138 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1139 && (ln == 1 || !(OP(c) == EXACTF
1140 ? ibcmp(s, m, ln)
1141 : ibcmp_locale(s, m, ln)))
1142 && (norun || regtry(prog, s)) )
1143 goto got_it;
1144 s++;
1145 }
b3c9acc1
IZ
1146 }
1147 break;
bbce6d69 1148 case BOUNDL:
3280af22 1149 PL_reg_flags |= RF_tainted;
bbce6d69 1150 /* FALL THROUGH */
a0d0e21e 1151 case BOUND:
ffc61ed2 1152 if (do_utf8) {
12d33761 1153 if (s == PL_bostr)
ffc61ed2
JH
1154 tmp = '\n';
1155 else {
b4f7163a 1156 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1157
b4f7163a 1158 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1159 }
1160 tmp = ((OP(c) == BOUND ?
9041c2e3 1161 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1162 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1163 while (s < strend) {
1164 if (tmp == !(OP(c) == BOUND ?
3568d838 1165 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1166 isALNUM_LC_utf8((U8*)s)))
1167 {
1168 tmp = !tmp;
1169 if ((norun || regtry(prog, s)))
1170 goto got_it;
1171 }
1172 s += UTF8SKIP(s);
a687059c 1173 }
a0d0e21e 1174 }
667bb95a 1175 else {
12d33761 1176 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1177 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1178 while (s < strend) {
1179 if (tmp ==
1180 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1181 tmp = !tmp;
1182 if ((norun || regtry(prog, s)))
1183 goto got_it;
1184 }
1185 s++;
a0ed51b3 1186 }
a0ed51b3 1187 }
6eb5f6b9 1188 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1189 goto got_it;
1190 break;
bbce6d69 1191 case NBOUNDL:
3280af22 1192 PL_reg_flags |= RF_tainted;
bbce6d69 1193 /* FALL THROUGH */
a0d0e21e 1194 case NBOUND:
ffc61ed2 1195 if (do_utf8) {
12d33761 1196 if (s == PL_bostr)
ffc61ed2
JH
1197 tmp = '\n';
1198 else {
b4f7163a 1199 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1200
b4f7163a 1201 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1202 }
1203 tmp = ((OP(c) == NBOUND ?
9041c2e3 1204 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1205 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1206 while (s < strend) {
1207 if (tmp == !(OP(c) == NBOUND ?
3568d838 1208 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1209 isALNUM_LC_utf8((U8*)s)))
1210 tmp = !tmp;
1211 else if ((norun || regtry(prog, s)))
1212 goto got_it;
1213 s += UTF8SKIP(s);
1214 }
a0d0e21e 1215 }
667bb95a 1216 else {
12d33761 1217 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1218 tmp = ((OP(c) == NBOUND ?
1219 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1220 while (s < strend) {
1221 if (tmp ==
1222 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1223 tmp = !tmp;
1224 else if ((norun || regtry(prog, s)))
1225 goto got_it;
1226 s++;
1227 }
a0ed51b3 1228 }
6eb5f6b9 1229 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1230 goto got_it;
1231 break;
a0d0e21e 1232 case ALNUM:
ffc61ed2 1233 if (do_utf8) {
8269fa76 1234 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1235 while (s < strend) {
3568d838 1236 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1237 if (tmp && (norun || regtry(prog, s)))
1238 goto got_it;
1239 else
1240 tmp = doevery;
1241 }
bbce6d69 1242 else
ffc61ed2
JH
1243 tmp = 1;
1244 s += UTF8SKIP(s);
bbce6d69 1245 }
bbce6d69 1246 }
ffc61ed2
JH
1247 else {
1248 while (s < strend) {
1249 if (isALNUM(*s)) {
1250 if (tmp && (norun || regtry(prog, s)))
1251 goto got_it;
1252 else
1253 tmp = doevery;
1254 }
a0ed51b3 1255 else
ffc61ed2
JH
1256 tmp = 1;
1257 s++;
a0ed51b3 1258 }
a0ed51b3
LW
1259 }
1260 break;
bbce6d69 1261 case ALNUML:
3280af22 1262 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1263 if (do_utf8) {
1264 while (s < strend) {
1265 if (isALNUM_LC_utf8((U8*)s)) {
1266 if (tmp && (norun || regtry(prog, s)))
1267 goto got_it;
1268 else
1269 tmp = doevery;
1270 }
a687059c 1271 else
ffc61ed2
JH
1272 tmp = 1;
1273 s += UTF8SKIP(s);
a0d0e21e 1274 }
a0d0e21e 1275 }
ffc61ed2
JH
1276 else {
1277 while (s < strend) {
1278 if (isALNUM_LC(*s)) {
1279 if (tmp && (norun || regtry(prog, s)))
1280 goto got_it;
1281 else
1282 tmp = doevery;
1283 }
a0ed51b3 1284 else
ffc61ed2
JH
1285 tmp = 1;
1286 s++;
a0ed51b3 1287 }
a0ed51b3
LW
1288 }
1289 break;
a0d0e21e 1290 case NALNUM:
ffc61ed2 1291 if (do_utf8) {
8269fa76 1292 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1293 while (s < strend) {
3568d838 1294 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1295 if (tmp && (norun || regtry(prog, s)))
1296 goto got_it;
1297 else
1298 tmp = doevery;
1299 }
bbce6d69 1300 else
ffc61ed2
JH
1301 tmp = 1;
1302 s += UTF8SKIP(s);
bbce6d69 1303 }
bbce6d69 1304 }
ffc61ed2
JH
1305 else {
1306 while (s < strend) {
1307 if (!isALNUM(*s)) {
1308 if (tmp && (norun || regtry(prog, s)))
1309 goto got_it;
1310 else
1311 tmp = doevery;
1312 }
a0ed51b3 1313 else
ffc61ed2
JH
1314 tmp = 1;
1315 s++;
a0ed51b3 1316 }
a0ed51b3
LW
1317 }
1318 break;
bbce6d69 1319 case NALNUML:
3280af22 1320 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1321 if (do_utf8) {
1322 while (s < strend) {
1323 if (!isALNUM_LC_utf8((U8*)s)) {
1324 if (tmp && (norun || regtry(prog, s)))
1325 goto got_it;
1326 else
1327 tmp = doevery;
1328 }
a687059c 1329 else
ffc61ed2
JH
1330 tmp = 1;
1331 s += UTF8SKIP(s);
a687059c 1332 }
a0d0e21e 1333 }
ffc61ed2
JH
1334 else {
1335 while (s < strend) {
1336 if (!isALNUM_LC(*s)) {
1337 if (tmp && (norun || regtry(prog, s)))
1338 goto got_it;
1339 else
1340 tmp = doevery;
1341 }
a0ed51b3 1342 else
ffc61ed2
JH
1343 tmp = 1;
1344 s++;
a0ed51b3 1345 }
a0ed51b3
LW
1346 }
1347 break;
a0d0e21e 1348 case SPACE:
ffc61ed2 1349 if (do_utf8) {
8269fa76 1350 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1351 while (s < strend) {
3568d838 1352 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1353 if (tmp && (norun || regtry(prog, s)))
1354 goto got_it;
1355 else
1356 tmp = doevery;
1357 }
a0d0e21e 1358 else
ffc61ed2
JH
1359 tmp = 1;
1360 s += UTF8SKIP(s);
2304df62 1361 }
a0d0e21e 1362 }
ffc61ed2
JH
1363 else {
1364 while (s < strend) {
1365 if (isSPACE(*s)) {
1366 if (tmp && (norun || regtry(prog, s)))
1367 goto got_it;
1368 else
1369 tmp = doevery;
1370 }
a0ed51b3 1371 else
ffc61ed2
JH
1372 tmp = 1;
1373 s++;
a0ed51b3 1374 }
a0ed51b3
LW
1375 }
1376 break;
bbce6d69 1377 case SPACEL:
3280af22 1378 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1379 if (do_utf8) {
1380 while (s < strend) {
1381 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1382 if (tmp && (norun || regtry(prog, s)))
1383 goto got_it;
1384 else
1385 tmp = doevery;
1386 }
bbce6d69 1387 else
ffc61ed2
JH
1388 tmp = 1;
1389 s += UTF8SKIP(s);
bbce6d69 1390 }
bbce6d69 1391 }
ffc61ed2
JH
1392 else {
1393 while (s < strend) {
1394 if (isSPACE_LC(*s)) {
1395 if (tmp && (norun || regtry(prog, s)))
1396 goto got_it;
1397 else
1398 tmp = doevery;
1399 }
a0ed51b3 1400 else
ffc61ed2
JH
1401 tmp = 1;
1402 s++;
a0ed51b3 1403 }
a0ed51b3
LW
1404 }
1405 break;
a0d0e21e 1406 case NSPACE:
ffc61ed2 1407 if (do_utf8) {
8269fa76 1408 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1409 while (s < strend) {
3568d838 1410 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1411 if (tmp && (norun || regtry(prog, s)))
1412 goto got_it;
1413 else
1414 tmp = doevery;
1415 }
a0d0e21e 1416 else
ffc61ed2
JH
1417 tmp = 1;
1418 s += UTF8SKIP(s);
a687059c 1419 }
a0d0e21e 1420 }
ffc61ed2
JH
1421 else {
1422 while (s < strend) {
1423 if (!isSPACE(*s)) {
1424 if (tmp && (norun || regtry(prog, s)))
1425 goto got_it;
1426 else
1427 tmp = doevery;
1428 }
a0ed51b3 1429 else
ffc61ed2
JH
1430 tmp = 1;
1431 s++;
a0ed51b3 1432 }
a0ed51b3
LW
1433 }
1434 break;
bbce6d69 1435 case NSPACEL:
3280af22 1436 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1437 if (do_utf8) {
1438 while (s < strend) {
1439 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1440 if (tmp && (norun || regtry(prog, s)))
1441 goto got_it;
1442 else
1443 tmp = doevery;
1444 }
bbce6d69 1445 else
ffc61ed2
JH
1446 tmp = 1;
1447 s += UTF8SKIP(s);
bbce6d69 1448 }
bbce6d69 1449 }
ffc61ed2
JH
1450 else {
1451 while (s < strend) {
1452 if (!isSPACE_LC(*s)) {
1453 if (tmp && (norun || regtry(prog, s)))
1454 goto got_it;
1455 else
1456 tmp = doevery;
1457 }
a0ed51b3 1458 else
ffc61ed2
JH
1459 tmp = 1;
1460 s++;
a0ed51b3 1461 }
a0ed51b3
LW
1462 }
1463 break;
a0d0e21e 1464 case DIGIT:
ffc61ed2 1465 if (do_utf8) {
8269fa76 1466 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1467 while (s < strend) {
3568d838 1468 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1469 if (tmp && (norun || regtry(prog, s)))
1470 goto got_it;
1471 else
1472 tmp = doevery;
1473 }
a0d0e21e 1474 else
ffc61ed2
JH
1475 tmp = 1;
1476 s += UTF8SKIP(s);
2b69d0c2 1477 }
a0d0e21e 1478 }
ffc61ed2
JH
1479 else {
1480 while (s < strend) {
1481 if (isDIGIT(*s)) {
1482 if (tmp && (norun || regtry(prog, s)))
1483 goto got_it;
1484 else
1485 tmp = doevery;
1486 }
a0ed51b3 1487 else
ffc61ed2
JH
1488 tmp = 1;
1489 s++;
a0ed51b3 1490 }
a0ed51b3
LW
1491 }
1492 break;
b8c5462f
JH
1493 case DIGITL:
1494 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1495 if (do_utf8) {
1496 while (s < strend) {
1497 if (isDIGIT_LC_utf8((U8*)s)) {
1498 if (tmp && (norun || regtry(prog, s)))
1499 goto got_it;
1500 else
1501 tmp = doevery;
1502 }
b8c5462f 1503 else
ffc61ed2
JH
1504 tmp = 1;
1505 s += UTF8SKIP(s);
b8c5462f 1506 }
b8c5462f 1507 }
ffc61ed2
JH
1508 else {
1509 while (s < strend) {
1510 if (isDIGIT_LC(*s)) {
1511 if (tmp && (norun || regtry(prog, s)))
1512 goto got_it;
1513 else
1514 tmp = doevery;
1515 }
b8c5462f 1516 else
ffc61ed2
JH
1517 tmp = 1;
1518 s++;
b8c5462f 1519 }
b8c5462f
JH
1520 }
1521 break;
a0d0e21e 1522 case NDIGIT:
ffc61ed2 1523 if (do_utf8) {
8269fa76 1524 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1525 while (s < strend) {
3568d838 1526 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1527 if (tmp && (norun || regtry(prog, s)))
1528 goto got_it;
1529 else
1530 tmp = doevery;
1531 }
a0d0e21e 1532 else
ffc61ed2
JH
1533 tmp = 1;
1534 s += UTF8SKIP(s);
a687059c 1535 }
a0d0e21e 1536 }
ffc61ed2
JH
1537 else {
1538 while (s < strend) {
1539 if (!isDIGIT(*s)) {
1540 if (tmp && (norun || regtry(prog, s)))
1541 goto got_it;
1542 else
1543 tmp = doevery;
1544 }
a0ed51b3 1545 else
ffc61ed2
JH
1546 tmp = 1;
1547 s++;
a0ed51b3 1548 }
a0ed51b3
LW
1549 }
1550 break;
b8c5462f
JH
1551 case NDIGITL:
1552 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1553 if (do_utf8) {
1554 while (s < strend) {
1555 if (!isDIGIT_LC_utf8((U8*)s)) {
1556 if (tmp && (norun || regtry(prog, s)))
1557 goto got_it;
1558 else
1559 tmp = doevery;
1560 }
b8c5462f 1561 else
ffc61ed2
JH
1562 tmp = 1;
1563 s += UTF8SKIP(s);
b8c5462f 1564 }
a0ed51b3 1565 }
ffc61ed2
JH
1566 else {
1567 while (s < strend) {
1568 if (!isDIGIT_LC(*s)) {
1569 if (tmp && (norun || regtry(prog, s)))
1570 goto got_it;
1571 else
1572 tmp = doevery;
1573 }
cf93c79d 1574 else
ffc61ed2
JH
1575 tmp = 1;
1576 s++;
b8c5462f 1577 }
b8c5462f
JH
1578 }
1579 break;
b3c9acc1 1580 default:
3c3eec57
GS
1581 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1582 break;
d6a28714 1583 }
6eb5f6b9
JH
1584 return 0;
1585 got_it:
1586 return s;
1587}
1588
1589/*
1590 - regexec_flags - match a regexp against a string
1591 */
1592I32
1593Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1594 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1595/* strend: pointer to null at end of string */
1596/* strbeg: real beginning of string */
1597/* minend: end of match must be >=minend after stringarg. */
1598/* data: May be used for some additional optimizations. */
1599/* nosave: For optimizations. */
1600{
6eb5f6b9
JH
1601 register char *s;
1602 register regnode *c;
1603 register char *startpos = stringarg;
6eb5f6b9
JH
1604 I32 minlen; /* must match at least this many chars */
1605 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1606 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1607 constant substr. */ /* CC */
1608 I32 end_shift = 0; /* Same for the end. */ /* CC */
1609 I32 scream_pos = -1; /* Internal iterator of scream. */
1610 char *scream_olds;
1611 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1612 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1613#ifdef DEBUGGING
9e55ce06
JH
1614 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1615 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1616#endif
a30b2f1f 1617 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1618
1619 PL_regcc = 0;
1620
1621 cache_re(prog);
1622#ifdef DEBUGGING
aea4f609 1623 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1624#endif
1625
1626 /* Be paranoid... */
1627 if (prog == NULL || startpos == NULL) {
1628 Perl_croak(aTHX_ "NULL regexp parameter");
1629 return 0;
1630 }
1631
1632 minlen = prog->minlen;
61a36c01 1633 if (strend - startpos < minlen) {
a72c7584
JH
1634 DEBUG_r(PerlIO_printf(Perl_debug_log,
1635 "String too short [regexec_flags]...\n"));
1636 goto phooey;
1aa99e6b 1637 }
6eb5f6b9 1638
6eb5f6b9
JH
1639 /* Check validity of program. */
1640 if (UCHARAT(prog->program) != REG_MAGIC) {
1641 Perl_croak(aTHX_ "corrupted regexp program");
1642 }
1643
1644 PL_reg_flags = 0;
1645 PL_reg_eval_set = 0;
1646 PL_reg_maxiter = 0;
1647
1648 if (prog->reganch & ROPT_UTF8)
1649 PL_reg_flags |= RF_utf8;
1650
1651 /* Mark beginning of line for ^ and lookbehind. */
1652 PL_regbol = startpos;
1653 PL_bostr = strbeg;
1654 PL_reg_sv = sv;
1655
1656 /* Mark end of line for $ (and such) */
1657 PL_regeol = strend;
1658
1659 /* see how far we have to get to not match where we matched before */
1660 PL_regtill = startpos+minend;
1661
1662 /* We start without call_cc context. */
1663 PL_reg_call_cc = 0;
1664
1665 /* If there is a "must appear" string, look for it. */
1666 s = startpos;
1667
1668 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1669 MAGIC *mg;
1670
1671 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1672 PL_reg_ganch = startpos;
1673 else if (sv && SvTYPE(sv) >= SVt_PVMG
1674 && SvMAGIC(sv)
14befaf4
DM
1675 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1676 && mg->mg_len >= 0) {
6eb5f6b9
JH
1677 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1678 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1679 if (s > PL_reg_ganch)
6eb5f6b9
JH
1680 goto phooey;
1681 s = PL_reg_ganch;
1682 }
1683 }
1684 else /* pos() not defined */
1685 PL_reg_ganch = strbeg;
1686 }
1687
33b8afdf 1688 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1689 re_scream_pos_data d;
1690
1691 d.scream_olds = &scream_olds;
1692 d.scream_pos = &scream_pos;
1693 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1694 if (!s) {
1695 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1696 goto phooey; /* not present */
3fa9c3d7 1697 }
6eb5f6b9
JH
1698 }
1699
2a782b5b 1700 DEBUG_r({
9e55ce06
JH
1701 char *s0 = UTF ?
1702 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1703 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1704 prog->precomp;
1705 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1706 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1707 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1708 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1709 if (!PL_colorset)
1710 reginitcolors();
1711 PerlIO_printf(Perl_debug_log,
9e55ce06 1712 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1713 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1714 len0, len0, s0,
2a782b5b 1715 PL_colors[1],
9e55ce06 1716 len0 > 60 ? "..." : "",
2a782b5b 1717 PL_colors[0],
9e55ce06
JH
1718 (int)(len1 > 60 ? 60 : len1),
1719 s1, PL_colors[1],
1720 (len1 > 60 ? "..." : "")
2a782b5b
JH
1721 );
1722 });
6eb5f6b9
JH
1723
1724 /* Simplest case: anchored match need be tried only once. */
1725 /* [unless only anchor is BOL and multiline is set] */
1726 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1727 if (s == startpos && regtry(prog, startpos))
1728 goto got_it;
1729 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1730 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1731 {
1732 char *end;
1733
1734 if (minlen)
1735 dontbother = minlen - 1;
1aa99e6b 1736 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1737 /* for multiline we only have to try after newlines */
33b8afdf 1738 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1739 if (s == startpos)
1740 goto after_try;
1741 while (1) {
1742 if (regtry(prog, s))
1743 goto got_it;
1744 after_try:
1745 if (s >= end)
1746 goto phooey;
1747 if (prog->reganch & RE_USE_INTUIT) {
1748 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1749 if (!s)
1750 goto phooey;
1751 }
1752 else
1753 s++;
1754 }
1755 } else {
1756 if (s > startpos)
1757 s--;
1758 while (s < end) {
1759 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1760 if (regtry(prog, s))
1761 goto got_it;
1762 }
1763 }
1764 }
1765 }
1766 goto phooey;
1767 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1768 if (regtry(prog, PL_reg_ganch))
1769 goto got_it;
1770 goto phooey;
1771 }
1772
1773 /* Messy cases: unanchored match. */
33b8afdf 1774 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1775 /* we have /x+whatever/ */
1776 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1777 char ch;
bf93d4cc
GS
1778#ifdef DEBUGGING
1779 int did_match = 0;
1780#endif
33b8afdf
JH
1781 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1782 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1783 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1784
1aa99e6b 1785 if (do_utf8) {
6eb5f6b9
JH
1786 while (s < strend) {
1787 if (*s == ch) {
bf93d4cc 1788 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1789 if (regtry(prog, s)) goto got_it;
1790 s += UTF8SKIP(s);
1791 while (s < strend && *s == ch)
1792 s += UTF8SKIP(s);
1793 }
1794 s += UTF8SKIP(s);
1795 }
1796 }
1797 else {
1798 while (s < strend) {
1799 if (*s == ch) {
bf93d4cc 1800 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1801 if (regtry(prog, s)) goto got_it;
1802 s++;
1803 while (s < strend && *s == ch)
1804 s++;
1805 }
1806 s++;
1807 }
1808 }
b7953727 1809 DEBUG_r(if (!did_match)
bf93d4cc 1810 PerlIO_printf(Perl_debug_log,
b7953727
JH
1811 "Did not find anchored character...\n")
1812 );
6eb5f6b9
JH
1813 }
1814 /*SUPPRESS 560*/
33b8afdf
JH
1815 else if (prog->anchored_substr != Nullsv
1816 || prog->anchored_utf8 != Nullsv
1817 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1818 && prog->float_max_offset < strend - s)) {
1819 SV *must;
1820 I32 back_max;
1821 I32 back_min;
1822 char *last;
6eb5f6b9 1823 char *last1; /* Last position checked before */
bf93d4cc
GS
1824#ifdef DEBUGGING
1825 int did_match = 0;
1826#endif
33b8afdf
JH
1827 if (prog->anchored_substr || prog->anchored_utf8) {
1828 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1829 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1830 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1831 back_max = back_min = prog->anchored_offset;
1832 } else {
1833 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1834 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1835 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1836 back_max = prog->float_max_offset;
1837 back_min = prog->float_min_offset;
1838 }
1839 if (must == &PL_sv_undef)
1840 /* could not downgrade utf8 check substring, so must fail */
1841 goto phooey;
1842
1843 last = HOP3c(strend, /* Cannot start after this */
1844 -(I32)(CHR_SVLEN(must)
1845 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1846
1847 if (s > PL_bostr)
1848 last1 = HOPc(s, -1);
1849 else
1850 last1 = s - 1; /* bogus */
1851
1852 /* XXXX check_substr already used to find `s', can optimize if
1853 check_substr==must. */
1854 scream_pos = -1;
1855 dontbother = end_shift;
1856 strend = HOPc(strend, -dontbother);
1857 while ( (s <= last) &&
9041c2e3 1858 ((flags & REXEC_SCREAM)
1aa99e6b 1859 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1860 end_shift, &scream_pos, 0))
1aa99e6b 1861 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1862 (unsigned char*)strend, must,
6eb5f6b9 1863 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1864 /* we may be pointing at the wrong string */
1865 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
7ef91622 1866 s = strbeg + (s - SvPVX(sv));
bf93d4cc 1867 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1868 if (HOPc(s, -back_max) > last1) {
1869 last1 = HOPc(s, -back_min);
1870 s = HOPc(s, -back_max);
1871 }
1872 else {
1873 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1874
1875 last1 = HOPc(s, -back_min);
1876 s = t;
1877 }
1aa99e6b 1878 if (do_utf8) {
6eb5f6b9
JH
1879 while (s <= last1) {
1880 if (regtry(prog, s))
1881 goto got_it;
1882 s += UTF8SKIP(s);
1883 }
1884 }
1885 else {
1886 while (s <= last1) {
1887 if (regtry(prog, s))
1888 goto got_it;
1889 s++;
1890 }
1891 }
1892 }
b7953727
JH
1893 DEBUG_r(if (!did_match)
1894 PerlIO_printf(Perl_debug_log,
1895 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1896 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1897 ? "anchored" : "floating"),
1898 PL_colors[0],
1899 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1900 SvPVX(must),
b7953727
JH
1901 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1902 );
6eb5f6b9
JH
1903 goto phooey;
1904 }
155aba94 1905 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1906 if (minlen) {
1907 I32 op = (U8)OP(prog->regstclass);
66e933ab 1908 /* don't bother with what can't match */
f14c76ed
RGS
1909 if (PL_regkind[op] != EXACT && op != CANY)
1910 strend = HOPc(strend, -(minlen - 1));
1911 }
ffc61ed2
JH
1912 DEBUG_r({
1913 SV *prop = sv_newmortal();
9e55ce06
JH
1914 char *s0;
1915 char *s1;
1916 int len0;
1917 int len1;
1918
ffc61ed2 1919 regprop(prop, c);
9e55ce06
JH
1920 s0 = UTF ?
1921 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1922 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1923 SvPVX(prop);
1924 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1925 s1 = UTF ?
c728cb41 1926 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1927 len1 = UTF ? SvCUR(dsv1) : strend - s;
1928 PerlIO_printf(Perl_debug_log,
1929 "Matching stclass `%*.*s' against `%*.*s'\n",
1930 len0, len0, s0,
1931 len1, len1, s1);
ffc61ed2 1932 });
6eb5f6b9
JH
1933 if (find_byclass(prog, c, s, strend, startpos, 0))
1934 goto got_it;
bf93d4cc 1935 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1936 }
1937 else {
1938 dontbother = 0;
33b8afdf
JH
1939 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1940 /* Trim the end. */
d6a28714 1941 char *last;
33b8afdf
JH
1942 SV* float_real;
1943
1944 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1945 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1946 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1947
1948 if (flags & REXEC_SCREAM) {
33b8afdf 1949 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1950 end_shift, &scream_pos, 1); /* last one */
1951 if (!last)
ffc61ed2 1952 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1953 /* we may be pointing at the wrong string */
1954 else if (RX_MATCH_COPIED(prog))
7ef91622 1955 s = strbeg + (s - SvPVX(sv));
b8c5462f 1956 }
d6a28714
JH
1957 else {
1958 STRLEN len;
33b8afdf 1959 char *little = SvPV(float_real, len);
d6a28714 1960
33b8afdf 1961 if (SvTAIL(float_real)) {
d6a28714
JH
1962 if (memEQ(strend - len + 1, little, len - 1))
1963 last = strend - len + 1;
1964 else if (!PL_multiline)
9041c2e3 1965 last = memEQ(strend - len, little, len)
d6a28714 1966 ? strend - len : Nullch;
b8c5462f 1967 else
d6a28714
JH
1968 goto find_last;
1969 } else {
1970 find_last:
9041c2e3 1971 if (len)
d6a28714 1972 last = rninstr(s, strend, little, little + len);
b8c5462f 1973 else
d6a28714 1974 last = strend; /* matching `$' */
b8c5462f 1975 }
b8c5462f 1976 }
bf93d4cc
GS
1977 if (last == NULL) {
1978 DEBUG_r(PerlIO_printf(Perl_debug_log,
1979 "%sCan't trim the tail, match fails (should not happen)%s\n",
1980 PL_colors[4],PL_colors[5]));
1981 goto phooey; /* Should not happen! */
1982 }
d6a28714
JH
1983 dontbother = strend - last + prog->float_min_offset;
1984 }
1985 if (minlen && (dontbother < minlen))
1986 dontbother = minlen - 1;
1987 strend -= dontbother; /* this one's always in bytes! */
1988 /* We don't know much -- general case. */
1aa99e6b 1989 if (do_utf8) {
d6a28714
JH
1990 for (;;) {
1991 if (regtry(prog, s))
1992 goto got_it;
1993 if (s >= strend)
1994 break;
b8c5462f 1995 s += UTF8SKIP(s);
d6a28714
JH
1996 };
1997 }
1998 else {
1999 do {
2000 if (regtry(prog, s))
2001 goto got_it;
2002 } while (s++ < strend);
2003 }
2004 }
2005
2006 /* Failure. */
2007 goto phooey;
2008
2009got_it:
2010 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2011
2012 if (PL_reg_eval_set) {
2013 /* Preserve the current value of $^R */
2014 if (oreplsv != GvSV(PL_replgv))
2015 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2016 restored, the value remains
2017 the same. */
acfe0abc 2018 restore_pos(aTHX_ 0);
d6a28714
JH
2019 }
2020
2021 /* make sure $`, $&, $', and $digit will work later */
2022 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2023 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2024 if (flags & REXEC_COPY_STR) {
2025 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2026#ifdef PERL_COPY_ON_WRITE
2027 if ((SvIsCOW(sv)
2028 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2029 if (DEBUG_C_TEST) {
2030 PerlIO_printf(Perl_debug_log,
2031 "Copy on write: regexp capture, type %d\n",
2032 (int) SvTYPE(sv));
2033 }
2034 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2035 prog->subbeg = SvPVX(prog->saved_copy);
2036 assert (SvPOKp(prog->saved_copy));
2037 } else
2038#endif
2039 {
2040 RX_MATCH_COPIED_on(prog);
2041 s = savepvn(strbeg, i);
2042 prog->subbeg = s;
2043 }
d6a28714 2044 prog->sublen = i;
d6a28714
JH
2045 }
2046 else {
2047 prog->subbeg = strbeg;
2048 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2049 }
2050 }
9041c2e3 2051
d6a28714
JH
2052 return 1;
2053
2054phooey:
bf93d4cc
GS
2055 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2056 PL_colors[4],PL_colors[5]));
d6a28714 2057 if (PL_reg_eval_set)
acfe0abc 2058 restore_pos(aTHX_ 0);
d6a28714
JH
2059 return 0;
2060}
2061
2062/*
2063 - regtry - try match at specific point
2064 */
2065STATIC I32 /* 0 failure, 1 success */
2066S_regtry(pTHX_ regexp *prog, char *startpos)
2067{
d6a28714
JH
2068 register I32 i;
2069 register I32 *sp;
2070 register I32 *ep;
2071 CHECKPOINT lastcp;
2072
02db2b7b
IZ
2073#ifdef DEBUGGING
2074 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2075#endif
d6a28714
JH
2076 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2077 MAGIC *mg;
2078
2079 PL_reg_eval_set = RS_init;
2080 DEBUG_r(DEBUG_s(
b900a521
JH
2081 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2082 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2083 ));
e8347627 2084 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2085 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2086 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2087 SAVETMPS;
2088 /* Apparently this is not needed, judging by wantarray. */
e8347627 2089 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2090 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2091
2092 if (PL_reg_sv) {
2093 /* Make $_ available to executed code. */
2094 if (PL_reg_sv != DEFSV) {
4d1ff10f 2095 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2096 SAVESPTR(DEFSV);
2097 DEFSV = PL_reg_sv;
b8c5462f 2098 }
d6a28714 2099
9041c2e3 2100 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2101 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2102 /* prepare for quick setting of pos */
14befaf4
DM
2103 sv_magic(PL_reg_sv, (SV*)0,
2104 PERL_MAGIC_regex_global, Nullch, 0);
2105 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2106 mg->mg_len = -1;
b8c5462f 2107 }
d6a28714
JH
2108 PL_reg_magic = mg;
2109 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2110 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2111 }
09687e5a 2112 if (!PL_reg_curpm) {
0f79a09d 2113 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2114#ifdef USE_ITHREADS
2115 {
2116 SV* repointer = newSViv(0);
577e12cc 2117 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2118 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2119 av_push(PL_regex_padav,repointer);
2120 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2121 PL_regex_pad = AvARRAY(PL_regex_padav);
2122 }
2123#endif
2124 }
aaa362c4 2125 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2126 PL_reg_oldcurpm = PL_curpm;
2127 PL_curpm = PL_reg_curpm;
2128 if (RX_MATCH_COPIED(prog)) {
2129 /* Here is a serious problem: we cannot rewrite subbeg,
2130 since it may be needed if this match fails. Thus
2131 $` inside (?{}) could fail... */
2132 PL_reg_oldsaved = prog->subbeg;
2133 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2134#ifdef PERL_COPY_ON_WRITE
2135 PL_nrs = prog->saved_copy;
2136#endif
d6a28714
JH
2137 RX_MATCH_COPIED_off(prog);
2138 }
2139 else
2140 PL_reg_oldsaved = Nullch;
2141 prog->subbeg = PL_bostr;
2142 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2143 }
2144 prog->startp[0] = startpos - PL_bostr;
2145 PL_reginput = startpos;
2146 PL_regstartp = prog->startp;
2147 PL_regendp = prog->endp;
2148 PL_reglastparen = &prog->lastparen;
a01268b5 2149 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2150 prog->lastparen = 0;
03994de8 2151 prog->lastcloseparen = 0;
d6a28714
JH
2152 PL_regsize = 0;
2153 DEBUG_r(PL_reg_starttry = startpos);
2154 if (PL_reg_start_tmpl <= prog->nparens) {
2155 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2156 if(PL_reg_start_tmp)
2157 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2158 else
2159 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2160 }
2161
2162 /* XXXX What this code is doing here?!!! There should be no need
2163 to do this again and again, PL_reglastparen should take care of
3dd2943c 2164 this! --ilya*/
dafc8851
JH
2165
2166 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2167 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2168 * PL_reglastparen), is not needed at all by the test suite
2169 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2170 * enough, for building DynaLoader, or otherwise this
2171 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2172 * will happen. Meanwhile, this code *is* needed for the
2173 * above-mentioned test suite tests to succeed. The common theme
2174 * on those tests seems to be returning null fields from matches.
2175 * --jhi */
dafc8851 2176#if 1
d6a28714
JH
2177 sp = prog->startp;
2178 ep = prog->endp;
2179 if (prog->nparens) {
eb160463 2180 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2181 *++sp = -1;
2182 *++ep = -1;
2183 }
2184 }
dafc8851 2185#endif
02db2b7b 2186 REGCP_SET(lastcp);
d6a28714
JH
2187 if (regmatch(prog->program + 1)) {
2188 prog->endp[0] = PL_reginput - PL_bostr;
2189 return 1;
2190 }
02db2b7b 2191 REGCP_UNWIND(lastcp);
d6a28714
JH
2192 return 0;
2193}
2194
02db2b7b
IZ
2195#define RE_UNWIND_BRANCH 1
2196#define RE_UNWIND_BRANCHJ 2
2197
2198union re_unwind_t;
2199
2200typedef struct { /* XX: makes sense to enlarge it... */
2201 I32 type;
2202 I32 prev;
2203 CHECKPOINT lastcp;
2204} re_unwind_generic_t;
2205
2206typedef struct {
2207 I32 type;
2208 I32 prev;
2209 CHECKPOINT lastcp;
2210 I32 lastparen;
2211 regnode *next;
2212 char *locinput;
2213 I32 nextchr;
2214#ifdef DEBUGGING
2215 int regindent;
2216#endif
2217} re_unwind_branch_t;
2218
2219typedef union re_unwind_t {
2220 I32 type;
2221 re_unwind_generic_t generic;
2222 re_unwind_branch_t branch;
2223} re_unwind_t;
2224
8ba1375e
MJD
2225#define sayYES goto yes
2226#define sayNO goto no
e0f9d4a8 2227#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2228#define sayYES_FINAL goto yes_final
2229#define sayYES_LOUD goto yes_loud
2230#define sayNO_FINAL goto no_final
2231#define sayNO_SILENT goto do_no
2232#define saySAME(x) if (x) goto yes; else goto no
2233
2234#define REPORT_CODE_OFF 24
2235
d6a28714
JH
2236/*
2237 - regmatch - main matching routine
2238 *
2239 * Conceptually the strategy is simple: check to see whether the current
2240 * node matches, call self recursively to see whether the rest matches,
2241 * and then act accordingly. In practice we make some effort to avoid
2242 * recursion, in particular by going through "ordinary" nodes (that don't
2243 * need to know whether the rest of the match failed) by a loop instead of
2244 * by recursion.
2245 */
2246/* [lwall] I've hoisted the register declarations to the outer block in order to
2247 * maybe save a little bit of pushing and popping on the stack. It also takes
2248 * advantage of machines that use a register save mask on subroutine entry.
2249 */
2250STATIC I32 /* 0 failure, 1 success */
2251S_regmatch(pTHX_ regnode *prog)
2252{
d6a28714
JH
2253 register regnode *scan; /* Current node. */
2254 regnode *next; /* Next node. */
2255 regnode *inner; /* Next node in internal branch. */
2256 register I32 nextchr; /* renamed nextchr - nextchar colides with
2257 function of same name */
2258 register I32 n; /* no or next */
b7953727
JH
2259 register I32 ln = 0; /* len or last */
2260 register char *s = Nullch; /* operand or save */
d6a28714 2261 register char *locinput = PL_reginput;
b7953727 2262 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2263 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2264 I32 unwind = 0;
b7953727 2265#if 0
02db2b7b 2266 I32 firstcp = PL_savestack_ix;
b7953727 2267#endif
53c4c00c 2268 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2269#ifdef DEBUGGING
ce333219
JH
2270 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2271 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2272 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2273#endif
02db2b7b 2274
d6a28714
JH
2275#ifdef DEBUGGING
2276 PL_regindent++;
2277#endif
2278
2279 /* Note that nextchr is a byte even in UTF */
2280 nextchr = UCHARAT(locinput);
2281 scan = prog;
2282 while (scan != NULL) {
8ba1375e 2283
2a782b5b 2284 DEBUG_r( {
d6a28714
JH
2285 SV *prop = sv_newmortal();
2286 int docolor = *PL_colors[0];
2287 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2288 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2289 /* The part of the string before starttry has one color
2290 (pref0_len chars), between starttry and current
2291 position another one (pref_len - pref0_len chars),
2292 after the current position the third one.
2293 We assume that pref0_len <= pref_len, otherwise we
2294 decrease pref0_len. */
9041c2e3 2295 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2296 ? (5 + taill) - l : locinput - PL_bostr;
2297 int pref0_len;
d6a28714 2298
df1ffd02 2299 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2300 pref_len++;
2301 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2302 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2303 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2304 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2305 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2306 l--;
d6a28714
JH
2307 if (pref0_len < 0)
2308 pref0_len = 0;
2309 if (pref0_len > pref_len)
2310 pref0_len = pref_len;
2311 regprop(prop, scan);
2a782b5b
JH
2312 {
2313 char *s0 =
f14c76ed 2314 do_utf8 && OP(scan) != CANY ?
2a782b5b 2315 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2316 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2317 locinput - pref_len;
df1ffd02 2318 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2319 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2320 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2321 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2322 locinput - pref_len + pref0_len;
df1ffd02 2323 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2324 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2325 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2326 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2327 locinput;
df1ffd02 2328 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2329 PerlIO_printf(Perl_debug_log,
2330 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2331 (IV)(locinput - PL_bostr),
2332 PL_colors[4],
2333 len0, s0,
2334 PL_colors[5],
2335 PL_colors[2],
2336 len1, s1,
2337 PL_colors[3],
2338 (docolor ? "" : "> <"),
2339 PL_colors[0],
2340 len2, s2,
2341 PL_colors[1],
2342 15 - l - pref_len + 1,
2343 "",
2344 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2345 SvPVX(prop));
2346 }
2347 });
d6a28714
JH
2348
2349 next = scan + NEXT_OFF(scan);
2350 if (next == scan)
2351 next = NULL;
2352
2353 switch (OP(scan)) {
2354 case BOL:
12d33761
HS
2355 if (locinput == PL_bostr || (PL_multiline &&
2356 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2357 {
2358 /* regtill = regbol; */
b8c5462f
JH
2359 break;
2360 }
d6a28714
JH
2361 sayNO;
2362 case MBOL:
12d33761
HS
2363 if (locinput == PL_bostr ||
2364 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2365 {
b8c5462f
JH
2366 break;
2367 }
d6a28714
JH
2368 sayNO;
2369 case SBOL:
c2a73568 2370 if (locinput == PL_bostr)
b8c5462f 2371 break;
d6a28714
JH
2372 sayNO;
2373 case GPOS:
2374 if (locinput == PL_reg_ganch)
2375 break;
2376 sayNO;
2377 case EOL:
2378 if (PL_multiline)
2379 goto meol;
2380 else
2381 goto seol;
2382 case MEOL:
2383 meol:
2384 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2385 sayNO;
b8c5462f 2386 break;
d6a28714
JH
2387 case SEOL:
2388 seol:
2389 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2390 sayNO;
d6a28714 2391 if (PL_regeol - locinput > 1)
b8c5462f 2392 sayNO;
b8c5462f 2393 break;
d6a28714
JH
2394 case EOS:
2395 if (PL_regeol != locinput)
b8c5462f 2396 sayNO;
d6a28714 2397 break;
ffc61ed2 2398 case SANY:
d6a28714 2399 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2400 sayNO;
f33976b4
DB
2401 if (do_utf8) {
2402 locinput += PL_utf8skip[nextchr];
2403 if (locinput > PL_regeol)
2404 sayNO;
2405 nextchr = UCHARAT(locinput);
2406 }
2407 else
2408 nextchr = UCHARAT(++locinput);
2409 break;
2410 case CANY:
2411 if (!nextchr && locinput >= PL_regeol)
2412 sayNO;
b8c5462f 2413 nextchr = UCHARAT(++locinput);
a0d0e21e 2414 break;
ffc61ed2 2415 case REG_ANY:
1aa99e6b
IH
2416 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2417 sayNO;
2418 if (do_utf8) {
b8c5462f 2419 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2420 if (locinput > PL_regeol)
2421 sayNO;
a0ed51b3 2422 nextchr = UCHARAT(locinput);
a0ed51b3 2423 }
1aa99e6b
IH
2424 else
2425 nextchr = UCHARAT(++locinput);
a0ed51b3 2426 break;
d6a28714 2427 case EXACT:
cd439c50
IZ
2428 s = STRING(scan);
2429 ln = STR_LEN(scan);
eb160463 2430 if (do_utf8 != UTF) {
bc517b45 2431 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2432 char *l = locinput;
2433 char *e = s + ln;
bc517b45 2434 STRLEN ulen;
a72c7584 2435
5ff6fc6d
JH
2436 if (do_utf8) {
2437 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2438 while (s < e) {
2439 if (l >= PL_regeol)
5ff6fc6d
JH
2440 sayNO;
2441 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2442 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2443 ckWARN(WARN_UTF8) ?
2444 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2445 sayNO;
bc517b45 2446 l += ulen;
5ff6fc6d 2447 s ++;
1aa99e6b 2448 }
5ff6fc6d
JH
2449 }
2450 else {
2451 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2452 while (s < e) {
2453 if (l >= PL_regeol)
2454 sayNO;
5ff6fc6d 2455 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2456 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2457 ckWARN(WARN_UTF8) ?
2458 0 : UTF8_ALLOW_ANY))
1aa99e6b 2459 sayNO;
bc517b45 2460 s += ulen;
a72c7584 2461 l ++;
1aa99e6b 2462 }
5ff6fc6d 2463 }
1aa99e6b
IH
2464 locinput = l;
2465 nextchr = UCHARAT(locinput);
2466 break;
2467 }
bc517b45 2468 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2469 /* Inline the first character, for speed. */
2470 if (UCHARAT(s) != nextchr)
2471 sayNO;
2472 if (PL_regeol - locinput < ln)
2473 sayNO;
2474 if (ln > 1 && memNE(s, locinput, ln))
2475 sayNO;
2476 locinput += ln;
2477 nextchr = UCHARAT(locinput);
2478 break;
2479 case EXACTFL:
b8c5462f
JH
2480 PL_reg_flags |= RF_tainted;
2481 /* FALL THROUGH */
d6a28714 2482 case EXACTF:
cd439c50
IZ
2483 s = STRING(scan);
2484 ln = STR_LEN(scan);
d6a28714 2485
d07ddd77
JH
2486 if (do_utf8 || UTF) {
2487 /* Either target or the pattern are utf8. */
d6a28714 2488 char *l = locinput;
d07ddd77 2489 char *e = PL_regeol;
bc517b45 2490
eb160463 2491 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2492 l, &e, 0, do_utf8)) {
5486206c
JH
2493 /* One more case for the sharp s:
2494 * pack("U0U*", 0xDF) =~ /ss/i,
2495 * the 0xC3 0x9F are the UTF-8
2496 * byte sequence for the U+00DF. */
2497 if (!(do_utf8 &&
2498 toLOWER(s[0]) == 's' &&
2499 ln >= 2 &&
2500 toLOWER(s[1]) == 's' &&
2501 (U8)l[0] == 0xC3 &&
2502 e - l >= 2 &&
2503 (U8)l[1] == 0x9F))
2504 sayNO;
2505 }
d07ddd77
JH
2506 locinput = e;
2507 nextchr = UCHARAT(locinput);
2508 break;
a0ed51b3 2509 }
d6a28714 2510
bc517b45
JH
2511 /* Neither the target and the pattern are utf8. */
2512
d6a28714
JH
2513 /* Inline the first character, for speed. */
2514 if (UCHARAT(s) != nextchr &&
2515 UCHARAT(s) != ((OP(scan) == EXACTF)
2516 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2517 sayNO;
d6a28714 2518 if (PL_regeol - locinput < ln)
b8c5462f 2519 sayNO;
d6a28714
JH
2520 if (ln > 1 && (OP(scan) == EXACTF
2521 ? ibcmp(s, locinput, ln)
2522 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2523 sayNO;
d6a28714
JH
2524 locinput += ln;
2525 nextchr = UCHARAT(locinput);
a0d0e21e 2526 break;
d6a28714 2527 case ANYOF:
ffc61ed2 2528 if (do_utf8) {
9e55ce06
JH
2529 STRLEN inclasslen = PL_regeol - locinput;
2530
ba7b4546 2531 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2532 sayNO_ANYOF;
ffc61ed2
JH
2533 if (locinput >= PL_regeol)
2534 sayNO;
0f0076b4 2535 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2536 nextchr = UCHARAT(locinput);
e0f9d4a8 2537 break;
ffc61ed2
JH
2538 }
2539 else {
2540 if (nextchr < 0)
2541 nextchr = UCHARAT(locinput);
7d3e948e 2542 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2543 sayNO_ANYOF;
ffc61ed2
JH
2544 if (!nextchr && locinput >= PL_regeol)
2545 sayNO;
2546 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2547 break;
2548 }
2549 no_anyof:
2550 /* If we might have the case of the German sharp s
2551 * in a casefolding Unicode character class. */
2552
ebc501f0
JH
2553 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2554 locinput += SHARP_S_SKIP;
e0f9d4a8 2555 nextchr = UCHARAT(locinput);
ffc61ed2 2556 }
e0f9d4a8
JH
2557 else
2558 sayNO;
b8c5462f 2559 break;
d6a28714 2560 case ALNUML:
b8c5462f
JH
2561 PL_reg_flags |= RF_tainted;
2562 /* FALL THROUGH */
d6a28714 2563 case ALNUM:
b8c5462f 2564 if (!nextchr)
4633a7c4 2565 sayNO;
ffc61ed2 2566 if (do_utf8) {
ad24be35 2567 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2568 if (!(OP(scan) == ALNUM
3568d838 2569 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2570 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2571 {
2572 sayNO;
a0ed51b3 2573 }
b8c5462f 2574 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2575 nextchr = UCHARAT(locinput);
2576 break;
2577 }
ffc61ed2 2578 if (!(OP(scan) == ALNUM
d6a28714 2579 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2580 sayNO;
b8c5462f 2581 nextchr = UCHARAT(++locinput);
a0d0e21e 2582 break;
d6a28714 2583 case NALNUML:
b8c5462f
JH
2584 PL_reg_flags |= RF_tainted;
2585 /* FALL THROUGH */
d6a28714
JH
2586 case NALNUM:
2587 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2588 sayNO;
ffc61ed2 2589 if (do_utf8) {
8269fa76 2590 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2591 if (OP(scan) == NALNUM
3568d838 2592 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2593 : isALNUM_LC_utf8((U8*)locinput))
2594 {
b8c5462f 2595 sayNO;
d6a28714 2596 }
b8c5462f
JH
2597 locinput += PL_utf8skip[nextchr];
2598 nextchr = UCHARAT(locinput);
2599 break;
2600 }
ffc61ed2 2601 if (OP(scan) == NALNUM
d6a28714 2602 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2603 sayNO;
76e3520e 2604 nextchr = UCHARAT(++locinput);
a0d0e21e 2605 break;
d6a28714
JH
2606 case BOUNDL:
2607 case NBOUNDL:
3280af22 2608 PL_reg_flags |= RF_tainted;
bbce6d69 2609 /* FALL THROUGH */
d6a28714
JH
2610 case BOUND:
2611 case NBOUND:
2612 /* was last char in word? */
ffc61ed2 2613 if (do_utf8) {
12d33761
HS
2614 if (locinput == PL_bostr)
2615 ln = '\n';
ffc61ed2 2616 else {
b4f7163a 2617 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2618
b4f7163a 2619 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2620 }
2621 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2622 ln = isALNUM_uni(ln);
8269fa76 2623 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2624 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2625 }
2626 else {
9041c2e3 2627 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2628 n = isALNUM_LC_utf8((U8*)locinput);
2629 }
a0ed51b3 2630 }
d6a28714 2631 else {
12d33761
HS
2632 ln = (locinput != PL_bostr) ?
2633 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2634 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2635 ln = isALNUM(ln);
2636 n = isALNUM(nextchr);
2637 }
2638 else {
2639 ln = isALNUM_LC(ln);
2640 n = isALNUM_LC(nextchr);
2641 }
d6a28714 2642 }
ffc61ed2
JH
2643 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2644 OP(scan) == BOUNDL))
2645 sayNO;
a0ed51b3 2646 break;
d6a28714 2647 case SPACEL:
3280af22 2648 PL_reg_flags |= RF_tainted;
bbce6d69 2649 /* FALL THROUGH */
d6a28714 2650 case SPACE:
9442cb0e 2651 if (!nextchr)
4633a7c4 2652 sayNO;
1aa99e6b 2653 if (do_utf8) {
fd400ab9 2654 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2655 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2656 if (!(OP(scan) == SPACE
3568d838 2657 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2658 : isSPACE_LC_utf8((U8*)locinput)))
2659 {
2660 sayNO;
2661 }
2662 locinput += PL_utf8skip[nextchr];
2663 nextchr = UCHARAT(locinput);
2664 break;
d6a28714 2665 }
ffc61ed2
JH
2666 if (!(OP(scan) == SPACE
2667 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2668 sayNO;
2669 nextchr = UCHARAT(++locinput);
2670 }
2671 else {
2672 if (!(OP(scan) == SPACE
2673 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2674 sayNO;
2675 nextchr = UCHARAT(++locinput);
a0ed51b3 2676 }
a0ed51b3 2677 break;
d6a28714 2678 case NSPACEL:
3280af22 2679 PL_reg_flags |= RF_tainted;
bbce6d69 2680 /* FALL THROUGH */
d6a28714 2681 case NSPACE:
9442cb0e 2682 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2683 sayNO;
1aa99e6b 2684 if (do_utf8) {
8269fa76 2685 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2686 if (OP(scan) == NSPACE
3568d838 2687 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2688 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2689 {
2690 sayNO;
2691 }
2692 locinput += PL_utf8skip[nextchr];
2693 nextchr = UCHARAT(locinput);
2694 break;
a0ed51b3 2695 }
ffc61ed2 2696 if (OP(scan) == NSPACE
d6a28714 2697 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2698 sayNO;
76e3520e 2699 nextchr = UCHARAT(++locinput);
a0d0e21e 2700 break;
d6a28714 2701 case DIGITL:
a0ed51b3
LW
2702 PL_reg_flags |= RF_tainted;
2703 /* FALL THROUGH */
d6a28714 2704 case DIGIT:
9442cb0e 2705 if (!nextchr)
a0ed51b3 2706 sayNO;
1aa99e6b 2707 if (do_utf8) {
8269fa76 2708 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2709 if (!(OP(scan) == DIGIT
3568d838 2710 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2711 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2712 {
a0ed51b3 2713 sayNO;
dfe13c55 2714 }
6f06b55f 2715 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2716 nextchr = UCHARAT(locinput);
2717 break;
2718 }
ffc61ed2 2719 if (!(OP(scan) == DIGIT
9442cb0e 2720 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2721 sayNO;
2722 nextchr = UCHARAT(++locinput);
2723 break;
d6a28714 2724 case NDIGITL:
b8c5462f
JH
2725 PL_reg_flags |= RF_tainted;
2726 /* FALL THROUGH */
d6a28714 2727 case NDIGIT:
9442cb0e 2728 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2729 sayNO;
1aa99e6b 2730 if (do_utf8) {
8269fa76 2731 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2732 if (OP(scan) == NDIGIT
3568d838 2733 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2734 : isDIGIT_LC_utf8((U8*)locinput))
2735 {
a0ed51b3 2736 sayNO;
9442cb0e 2737 }
6f06b55f 2738 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2739 nextchr = UCHARAT(locinput);
2740 break;
2741 }
ffc61ed2 2742 if (OP(scan) == NDIGIT
9442cb0e 2743 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2744 sayNO;
2745 nextchr = UCHARAT(++locinput);
2746 break;
2747 case CLUMP:
b7c83a7e 2748 if (locinput >= PL_regeol)
a0ed51b3 2749 sayNO;
b7c83a7e
JH
2750 if (do_utf8) {
2751 LOAD_UTF8_CHARCLASS(mark,"~");
2752 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2753 sayNO;
2754 locinput += PL_utf8skip[nextchr];
2755 while (locinput < PL_regeol &&
2756 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2757 locinput += UTF8SKIP(locinput);
2758 if (locinput > PL_regeol)
2759 sayNO;
eb08e2da
JH
2760 }
2761 else
2762 locinput++;
a0ed51b3
LW
2763 nextchr = UCHARAT(locinput);
2764 break;
c8756f30 2765 case REFFL:
3280af22 2766 PL_reg_flags |= RF_tainted;
c8756f30 2767 /* FALL THROUGH */
c277df42 2768 case REF:
c8756f30 2769 case REFF:
c277df42 2770 n = ARG(scan); /* which paren pair */
cf93c79d 2771 ln = PL_regstartp[n];
2c2d71f5 2772 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2773 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2774 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2775 if (ln == PL_regendp[n])
a0d0e21e 2776 break;
a0ed51b3 2777
cf93c79d 2778 s = PL_bostr + ln;
1aa99e6b 2779 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2780 char *l = locinput;
cf93c79d 2781 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2782 /*
2783 * Note that we can't do the "other character" lookup trick as
2784 * in the 8-bit case (no pun intended) because in Unicode we
2785 * have to map both upper and title case to lower case.
2786 */
2787 if (OP(scan) == REFF) {
a2a2844f 2788 STRLEN ulen1, ulen2;
e7ae6809
JH
2789 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2790 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2791 while (s < e) {
2792 if (l >= PL_regeol)
2793 sayNO;
a2a2844f
JH
2794 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2795 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2796 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2797 sayNO;
a2a2844f
JH
2798 s += ulen1;
2799 l += ulen2;
a0ed51b3
LW
2800 }
2801 }
2802 locinput = l;
2803 nextchr = UCHARAT(locinput);
2804 break;
2805 }
2806
a0d0e21e 2807 /* Inline the first character, for speed. */
76e3520e 2808 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2809 (OP(scan) == REF ||
2810 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2811 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2812 sayNO;
cf93c79d 2813 ln = PL_regendp[n] - ln;
3280af22 2814 if (locinput + ln > PL_regeol)
4633a7c4 2815 sayNO;
c8756f30
AK
2816 if (ln > 1 && (OP(scan) == REF
2817 ? memNE(s, locinput, ln)
2818 : (OP(scan) == REFF
2819 ? ibcmp(s, locinput, ln)
2820 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2821 sayNO;
a0d0e21e 2822 locinput += ln;
76e3520e 2823 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2824 break;
2825
2826 case NOTHING:
c277df42 2827 case TAIL:
a0d0e21e
LW
2828 break;
2829 case BACK:
2830 break;
c277df42
IZ
2831 case EVAL:
2832 {
2833 dSP;
533c011a 2834 OP_4tree *oop = PL_op;
3280af22 2835 COP *ocurcop = PL_curcop;
f3548bdc 2836 PAD *old_comppad;
c277df42 2837 SV *ret;
9041c2e3 2838
c277df42 2839 n = ARG(scan);
533c011a 2840 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2841 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2842 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2843 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2844
8e5e9ebe
RGS
2845 {
2846 SV **before = SP;
2847 CALLRUNOPS(aTHX); /* Scalar context. */
2848 SPAGAIN;
2849 if (SP == before)
075aa684 2850 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2851 else {
2852 ret = POPs;
2853 PUTBACK;
2854 }
2855 }
2856
0f5d15d6 2857 PL_op = oop;
f3548bdc 2858 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2859 PL_curcop = ocurcop;
c277df42 2860 if (logical) {
0f5d15d6
IZ
2861 if (logical == 2) { /* Postponed subexpression. */
2862 regexp *re;
22c35a8c 2863 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2864 re_cc_state state;
0f5d15d6 2865 CHECKPOINT cp, lastcp;
cb50f42d 2866 int toggleutf;
faf82a0b 2867 register SV *sv;
0f5d15d6 2868
faf82a0b
AE
2869 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2870 mg = mg_find(sv, PERL_MAGIC_qr);
2871 else if (SvSMAGICAL(ret)) {
2872 if (SvGMAGICAL(ret))
2873 sv_unmagic(ret, PERL_MAGIC_qr);
2874 else
2875 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 2876 }
faf82a0b 2877
0f5d15d6
IZ
2878 if (mg) {
2879 re = (regexp *)mg->mg_obj;
df0003d4 2880 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2881 }
2882 else {
2883 STRLEN len;
2884 char *t = SvPV(ret, len);
2885 PMOP pm;
2886 char *oprecomp = PL_regprecomp;
2887 I32 osize = PL_regsize;
2888 I32 onpar = PL_regnpar;
2889
5fcd1c1b 2890 Zero(&pm, 1, PMOP);
cb50f42d 2891 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2892 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2893 if (!(SvFLAGS(ret)
faf82a0b
AE
2894 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2895 | SVs_GMG)))
14befaf4
DM
2896 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2897 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2898 PL_regprecomp = oprecomp;
2899 PL_regsize = osize;
2900 PL_regnpar = onpar;
2901 }
2902 DEBUG_r(
9041c2e3 2903 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2904 "Entering embedded `%s%.60s%s%s'\n",
2905 PL_colors[0],
2906 re->precomp,
2907 PL_colors[1],
2908 (strlen(re->precomp) > 60 ? "..." : ""))
2909 );
2910 state.node = next;
2911 state.prev = PL_reg_call_cc;
2912 state.cc = PL_regcc;
2913 state.re = PL_reg_re;
2914
2ab05381 2915 PL_regcc = 0;
9041c2e3 2916
0f5d15d6 2917 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2918 REGCP_SET(lastcp);
0f5d15d6
IZ
2919 cache_re(re);
2920 state.ss = PL_savestack_ix;
2921 *PL_reglastparen = 0;
a01268b5 2922 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2923 PL_reg_call_cc = &state;
2924 PL_reginput = locinput;
cb50f42d
YST
2925 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2926 ((re->reganch & ROPT_UTF8) != 0);
2927 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2928
2929 /* XXXX This is too dramatic a measure... */
2930 PL_reg_maxiter = 0;
2931
0f5d15d6 2932 if (regmatch(re->program + 1)) {
2c914db6
IZ
2933 /* Even though we succeeded, we need to restore
2934 global variables, since we may be wrapped inside
2935 SUSPEND, thus the match may be not finished yet. */
2936
2937 /* XXXX Do this only if SUSPENDed? */
2938 PL_reg_call_cc = state.prev;
2939 PL_regcc = state.cc;
2940 PL_reg_re = state.re;
2941 cache_re(PL_reg_re);
cb50f42d 2942 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2943
2944 /* XXXX This is too dramatic a measure... */
2945 PL_reg_maxiter = 0;
2946
2947 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2948 ReREFCNT_dec(re);
2949 regcpblow(cp);
2950 sayYES;
2951 }
0f5d15d6 2952 ReREFCNT_dec(re);
02db2b7b 2953 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2954 regcppop();
2955 PL_reg_call_cc = state.prev;
2956 PL_regcc = state.cc;
2957 PL_reg_re = state.re;
d3790889 2958 cache_re(PL_reg_re);
cb50f42d 2959 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2960
2961 /* XXXX This is too dramatic a measure... */
2962 PL_reg_maxiter = 0;
2963
8e514ae6 2964 logical = 0;
0f5d15d6
IZ
2965 sayNO;
2966 }
c277df42 2967 sw = SvTRUE(ret);
0f5d15d6 2968 logical = 0;
a0ed51b3
LW
2969 }
2970 else
3280af22 2971 sv_setsv(save_scalar(PL_replgv), ret);
c277df42
IZ
2972 break;
2973 }
a0d0e21e 2974 case OPEN:
c277df42 2975 n = ARG(scan); /* which paren pair */
3280af22
NIS
2976 PL_reg_start_tmp[n] = locinput;
2977 if (n > PL_regsize)
2978 PL_regsize = n;
a0d0e21e
LW
2979 break;
2980 case CLOSE:
c277df42 2981 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2982 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2983 PL_regendp[n] = locinput - PL_bostr;
eb160463 2984 if (n > (I32)*PL_reglastparen)
3280af22 2985 *PL_reglastparen = n;
a01268b5 2986 *PL_reglastcloseparen = n;
a0d0e21e 2987 break;
c277df42
IZ
2988 case GROUPP:
2989 n = ARG(scan); /* which paren pair */
eb160463 2990 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2991 break;
2992 case IFTHEN:
2c2d71f5 2993 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2994 if (sw)
2995 next = NEXTOPER(NEXTOPER(scan));
2996 else {
2997 next = scan + ARG(scan);
2998 if (OP(next) == IFTHEN) /* Fake one. */
2999 next = NEXTOPER(NEXTOPER(next));
3000 }
3001 break;
3002 case LOGICAL:
0f5d15d6 3003 logical = scan->flags;
c277df42 3004 break;
2ab05381
IZ
3005/*******************************************************************
3006 PL_regcc contains infoblock about the innermost (...)* loop, and
3007 a pointer to the next outer infoblock.
3008
3009 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3010
3011 1) After matching X, regnode for CURLYX is processed;
3012
9041c2e3 3013 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3014 regmatch() recursively with the starting point at WHILEM node;
3015
3016 3) Each hit of WHILEM node tries to match A and Z (in the order
3017 depending on the current iteration, min/max of {min,max} and
3018 greediness). The information about where are nodes for "A"
3019 and "Z" is read from the infoblock, as is info on how many times "A"
3020 was already matched, and greediness.
3021
3022 4) After A matches, the same WHILEM node is hit again.
3023
3024 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3025 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3026 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3027 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3028 of the external loop.
3029
3030 Currently present infoblocks form a tree with a stem formed by PL_curcc
3031 and whatever it mentions via ->next, and additional attached trees
3032 corresponding to temporarily unset infoblocks as in "5" above.
3033
9041c2e3 3034 In the following picture infoblocks for outer loop of
2ab05381
IZ
3035 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3036 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3037 infoblocks are drawn below the "reset" infoblock.
3038
3039 In fact in the picture below we do not show failed matches for Z and T
3040 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3041 more obvious *why* one needs to *temporary* unset infoblocks.]
3042
3043 Matched REx position InfoBlocks Comment
3044 (Y(A)*?Z)*?T x
3045 Y(A)*?Z)*?T x <- O
3046 Y (A)*?Z)*?T x <- O
3047 Y A)*?Z)*?T x <- O <- I
3048 YA )*?Z)*?T x <- O <- I
3049 YA A)*?Z)*?T x <- O <- I
3050 YAA )*?Z)*?T x <- O <- I
3051 YAA Z)*?T x <- O # Temporary unset I
3052 I
3053
3054 YAAZ Y(A)*?Z)*?T x <- O
3055 I
3056
3057 YAAZY (A)*?Z)*?T x <- O
3058 I
3059
3060 YAAZY A)*?Z)*?T x <- O <- I
3061 I
3062
3063 YAAZYA )*?Z)*?T x <- O <- I
3064 I
3065
3066 YAAZYA Z)*?T x <- O # Temporary unset I
3067 I,I
3068
3069 YAAZYAZ )*?T x <- O
3070 I,I
3071
3072 YAAZYAZ T x # Temporary unset O
3073 O
3074 I,I
3075
3076 YAAZYAZT x
3077 O
3078 I,I
3079 *******************************************************************/
a0d0e21e
LW
3080 case CURLYX: {
3081 CURCUR cc;
3280af22 3082 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3083 /* No need to save/restore up to this paren */
3084 I32 parenfloor = scan->flags;
c277df42
IZ
3085
3086 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3087 next += ARG(next);
3280af22
NIS
3088 cc.oldcc = PL_regcc;
3089 PL_regcc = &cc;
cb434fcc
IZ
3090 /* XXXX Probably it is better to teach regpush to support
3091 parenfloor > PL_regsize... */
eb160463 3092 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3093 parenfloor = *PL_reglastparen; /* Pessimization... */
3094 cc.parenfloor = parenfloor;
a0d0e21e
LW
3095 cc.cur = -1;
3096 cc.min = ARG1(scan);
3097 cc.max = ARG2(scan);
c277df42 3098 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3099 cc.next = next;
3100 cc.minmod = minmod;
3101 cc.lastloc = 0;
3280af22 3102 PL_reginput = locinput;
a0d0e21e
LW
3103 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3104 regcpblow(cp);
3280af22 3105 PL_regcc = cc.oldcc;
4633a7c4 3106 saySAME(n);
a0d0e21e
LW
3107 }
3108 /* NOT REACHED */
3109 case WHILEM: {
3110 /*
3111 * This is really hard to understand, because after we match
3112 * what we're trying to match, we must make sure the rest of
2c2d71f5 3113 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3114 * to go back UP the parse tree by recursing ever deeper. And
3115 * if it fails, we have to reset our parent's current state
3116 * that we can try again after backing off.
3117 */
3118
c277df42 3119 CHECKPOINT cp, lastcp;
3280af22 3120 CURCUR* cc = PL_regcc;
c277df42
IZ
3121 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3122
4633a7c4 3123 n = cc->cur + 1; /* how many we know we matched */
3280af22 3124 PL_reginput = locinput;
a0d0e21e 3125
c277df42 3126 DEBUG_r(
9041c2e3 3127 PerlIO_printf(Perl_debug_log,
91f3b821 3128 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3129 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3130 (long)n, (long)cc->min,
2797576d 3131 (long)cc->max, PTR2UV(cc))
c277df42 3132 );
4633a7c4 3133
a0d0e21e
LW
3134 /* If degenerate scan matches "", assume scan done. */
3135
579cf2c3 3136 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3137 PL_regcc = cc->oldcc;
2ab05381
IZ
3138 if (PL_regcc)
3139 ln = PL_regcc->cur;
c277df42 3140 DEBUG_r(
c3464db5
DD
3141 PerlIO_printf(Perl_debug_log,
3142 "%*s empty match detected, try continuation...\n",
3280af22 3143 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3144 );
a0d0e21e 3145 if (regmatch(cc->next))
4633a7c4 3146 sayYES;
2ab05381
IZ
3147 if (PL_regcc)
3148 PL_regcc->cur = ln;
3280af22 3149 PL_regcc = cc;
4633a7c4 3150 sayNO;
a0d0e21e
LW
3151 }
3152
3153 /* First just match a string of min scans. */
3154
3155 if (n < cc->min) {
3156 cc->cur = n;
3157 cc->lastloc = locinput;
4633a7c4
LW
3158 if (regmatch(cc->scan))
3159 sayYES;
3160 cc->cur = n - 1;
c277df42 3161 cc->lastloc = lastloc;
4633a7c4 3162 sayNO;
a0d0e21e
LW
3163 }
3164
2c2d71f5
JH
3165 if (scan->flags) {
3166 /* Check whether we already were at this position.
3167 Postpone detection until we know the match is not
3168 *that* much linear. */
3169 if (!PL_reg_maxiter) {
3170 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3171 PL_reg_leftiter = PL_reg_maxiter;
3172 }
3173 if (PL_reg_leftiter-- == 0) {
3174 I32 size = (PL_reg_maxiter + 7)/8;
3175 if (PL_reg_poscache) {
eb160463 3176 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3177 Renew(PL_reg_poscache, size, char);
3178 PL_reg_poscache_size = size;
3179 }
3180 Zero(PL_reg_poscache, size, char);
3181 }
3182 else {
3183 PL_reg_poscache_size = size;
3184 Newz(29, PL_reg_poscache, size, char);
3185 }
3186 DEBUG_r(
3187 PerlIO_printf(Perl_debug_log,
3188 "%sDetected a super-linear match, switching on caching%s...\n",
3189 PL_colors[4], PL_colors[5])
3190 );
3191 }
3192 if (PL_reg_leftiter < 0) {
3193 I32 o = locinput - PL_bostr, b;
3194
3195 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3196 b = o % 8;
3197 o /= 8;
3198 if (PL_reg_poscache[o] & (1<<b)) {
3199 DEBUG_r(
3200 PerlIO_printf(Perl_debug_log,
3201 "%*s already tried at this position...\n",
3202 REPORT_CODE_OFF+PL_regindent*2, "")
3203 );
7821416a 3204 sayNO_SILENT;
2c2d71f5
JH
3205 }
3206 PL_reg_poscache[o] |= (1<<b);
3207 }
3208 }
3209
a0d0e21e
LW
3210 /* Prefer next over scan for minimal matching. */
3211
3212 if (cc->minmod) {
3280af22 3213 PL_regcc = cc->oldcc;
2ab05381
IZ
3214 if (PL_regcc)
3215 ln = PL_regcc->cur;
5f05dabc 3216 cp = regcppush(cc->parenfloor);
02db2b7b 3217 REGCP_SET(lastcp);
5f05dabc 3218 if (regmatch(cc->next)) {
c277df42 3219 regcpblow(cp);
4633a7c4 3220 sayYES; /* All done. */
5f05dabc 3221 }
02db2b7b 3222 REGCP_UNWIND(lastcp);
5f05dabc 3223 regcppop();
2ab05381
IZ
3224 if (PL_regcc)
3225 PL_regcc->cur = ln;
3280af22 3226 PL_regcc = cc;
a0d0e21e 3227
c277df42 3228 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3229 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3230 && !(PL_reg_flags & RF_warned)) {
3231 PL_reg_flags |= RF_warned;
9014280d 3232 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3233 "Complex regular subexpression recursion",
3234 REG_INFTY - 1);
c277df42 3235 }
4633a7c4 3236 sayNO;
c277df42 3237 }
a687059c 3238
c277df42 3239 DEBUG_r(
c3464db5
DD
3240 PerlIO_printf(Perl_debug_log,
3241 "%*s trying longer...\n",
3280af22 3242 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3243 );
a0d0e21e 3244 /* Try scanning more and see if it helps. */
3280af22 3245 PL_reginput = locinput;
a0d0e21e
LW
3246 cc->cur = n;
3247 cc->lastloc = locinput;
5f05dabc 3248 cp = regcppush(cc->parenfloor);
02db2b7b 3249 REGCP_SET(lastcp);
5f05dabc 3250 if (regmatch(cc->scan)) {
c277df42 3251 regcpblow(cp);
4633a7c4 3252 sayYES;
5f05dabc 3253 }
02db2b7b 3254 REGCP_UNWIND(lastcp);
5f05dabc 3255 regcppop();
4633a7c4 3256 cc->cur = n - 1;
c277df42 3257 cc->lastloc = lastloc;
4633a7c4 3258 sayNO;
a0d0e21e
LW
3259 }
3260
3261 /* Prefer scan over next for maximal matching. */
3262
3263 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3264 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3265 cc->cur = n;
3266 cc->lastloc = locinput;
02db2b7b 3267 REGCP_SET(lastcp);
5f05dabc 3268 if (regmatch(cc->scan)) {
c277df42 3269 regcpblow(cp);
4633a7c4 3270 sayYES;
5f05dabc 3271 }
02db2b7b 3272 REGCP_UNWIND(lastcp);
a0d0e21e 3273 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3274 PL_reginput = locinput;
c277df42 3275 DEBUG_r(
c3464db5
DD
3276 PerlIO_printf(Perl_debug_log,
3277 "%*s failed, try continuation...\n",
3280af22 3278 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3279 );
3280 }
9041c2e3 3281 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3282 && !(PL_reg_flags & RF_warned)) {
3280af22 3283 PL_reg_flags |= RF_warned;
9014280d 3284 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3285 "Complex regular subexpression recursion",
3286 REG_INFTY - 1);
a0d0e21e
LW
3287 }
3288
3289 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3290 PL_regcc = cc->oldcc;
2ab05381
IZ
3291 if (PL_regcc)
3292 ln = PL_regcc->cur;
a0d0e21e 3293 if (regmatch(cc->next))
4633a7c4 3294 sayYES;
2ab05381
IZ
3295 if (PL_regcc)
3296 PL_regcc->cur = ln;
3280af22 3297 PL_regcc = cc;
4633a7c4 3298 cc->cur = n - 1;
c277df42 3299 cc->lastloc = lastloc;
4633a7c4 3300 sayNO;
a0d0e21e
LW
3301 }
3302 /* NOT REACHED */
9041c2e3 3303 case BRANCHJ:
c277df42
IZ
3304 next = scan + ARG(scan);
3305 if (next == scan)
3306 next = NULL;
3307 inner = NEXTOPER(NEXTOPER(scan));
3308 goto do_branch;
9041c2e3 3309 case BRANCH:
c277df42
IZ
3310 inner = NEXTOPER(scan);
3311 do_branch:
3312 {
c277df42
IZ
3313 c1 = OP(scan);
3314 if (OP(next) != c1) /* No choice. */
3315 next = inner; /* Avoid recursion. */
a0d0e21e 3316 else {
02db2b7b
IZ
3317 I32 lastparen = *PL_reglastparen;
3318 I32 unwind1;
3319 re_unwind_branch_t *uw;
3320
3321 /* Put unwinding data on stack */
3322 unwind1 = SSNEWt(1,re_unwind_branch_t);
3323 uw = SSPTRt(unwind1,re_unwind_branch_t);
3324 uw->prev = unwind;
3325 unwind = unwind1;
3326 uw->type = ((c1 == BRANCH)
3327 ? RE_UNWIND_BRANCH
3328 : RE_UNWIND_BRANCHJ);
3329 uw->lastparen = lastparen;
3330 uw->next = next;
3331 uw->locinput = locinput;
3332 uw->nextchr = nextchr;
3333#ifdef DEBUGGING
3334 uw->regindent = ++PL_regindent;
3335#endif
c277df42 3336
02db2b7b
IZ
3337 REGCP_SET(uw->lastcp);
3338
3339 /* Now go into the first branch */
3340 next = inner;
a687059c 3341 }
a0d0e21e
LW
3342 }
3343 break;
3344 case MINMOD:
3345 minmod = 1;
3346 break;
c277df42
IZ
3347 case CURLYM:
3348 {
00db4c45 3349 I32 l = 0;
c277df42 3350 CHECKPOINT lastcp;
9041c2e3 3351
c277df42
IZ
3352 /* We suppose that the next guy does not need
3353 backtracking: in particular, it is of constant length,
3354 and has no parenths to influence future backrefs. */
3355 ln = ARG1(scan); /* min to match */
3356 n = ARG2(scan); /* max to match */
c277df42
IZ
3357 paren = scan->flags;
3358 if (paren) {
3280af22
NIS
3359 if (paren > PL_regsize)
3360 PL_regsize = paren;
eb160463 3361 if (paren > (I32)*PL_reglastparen)
3280af22 3362 *PL_reglastparen = paren;
c277df42 3363 }
dc45a647 3364 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3365 if (paren)
3366 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3367 PL_reginput = locinput;
c277df42
IZ
3368 if (minmod) {
3369 minmod = 0;
3370 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3371 sayNO;
f31a99c8
HS
3372 /* if we matched something zero-length we don't need to
3373 backtrack - capturing parens are already defined, so
3374 the caveat in the maximal case doesn't apply
3375
3376 XXXX if ln == 0, we can redo this check first time
3377 through the following loop
3378 */
3379 if (ln && l == 0)
3380 n = ln; /* don't backtrack */
3280af22 3381 locinput = PL_reginput;
cca55fe3 3382 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3383 regnode *text_node = next;
3384
cca55fe3 3385 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3386
cca55fe3 3387 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3388 else {
cca55fe3
JP
3389 if (PL_regkind[(U8)OP(text_node)] == REF) {
3390 I32 n, ln;
3391 n = ARG(text_node); /* which paren pair */
3392 ln = PL_regstartp[n];
3393 /* assume yes if we haven't seen CLOSEn */
3394 if (
eb160463 3395 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3396 ln == -1 ||
3397 ln == PL_regendp[n]
3398 ) {
3399 c1 = c2 = -1000;
3400 goto assume_ok_MM;
3401 }
3402 c1 = *(PL_bostr + ln);
3403 }
3404 else { c1 = (U8)*STRING(text_node); }
af5decee 3405 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3406 c2 = PL_fold[c1];
af5decee 3407 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3408 c2 = PL_fold_locale[c1];
3409 else
3410 c2 = c1;
3411 }
a0ed51b3
LW
3412 }
3413 else
c277df42 3414 c1 = c2 = -1000;
cca55fe3 3415 assume_ok_MM:
02db2b7b 3416 REGCP_SET(lastcp);
5f4b28b2 3417 /* This may be improved if l == 0. */
c277df42
IZ
3418 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3419 /* If it could work, try it. */
3420 if (c1 == -1000 ||
3280af22
NIS
3421 UCHARAT(PL_reginput) == c1 ||
3422 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3423 {
3424 if (paren) {
f31a99c8 3425 if (ln) {
cf93c79d
IZ
3426 PL_regstartp[paren] =
3427 HOPc(PL_reginput, -l) - PL_bostr;
3428 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3429 }
3430 else
cf93c79d 3431 PL_regendp[paren] = -1;
c277df42
IZ
3432 }
3433 if (regmatch(next))
3434 sayYES;
02db2b7b 3435 REGCP_UNWIND(lastcp);
c277df42
IZ
3436 }
3437 /* Couldn't or didn't -- move forward. */
3280af22 3438 PL_reginput = locinput;
c277df42
IZ
3439 if (regrepeat_hard(scan, 1, &l)) {
3440 ln++;
3280af22 3441 locinput = PL_reginput;
c277df42
IZ
3442 }
3443 else
3444 sayNO;
3445 }
a0ed51b3
LW
3446 }
3447 else {
c277df42 3448 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3449 /* if we matched something zero-length we don't need to
3450 backtrack, unless the minimum count is zero and we
3451 are capturing the result - in that case the capture
3452 being defined or not may affect later execution
3453 */
3454 if (n != 0 && l == 0 && !(paren && ln == 0))
3455 ln = n; /* don't backtrack */
3280af22 3456 locinput = PL_reginput;
c277df42 3457 DEBUG_r(
5c0ca799 3458 PerlIO_printf(Perl_debug_log,
faccc32b 3459 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3460 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3461 (IV) n, (IV)l)
c277df42
IZ
3462 );
3463 if (n >= ln) {
cca55fe3 3464 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3465 regnode *text_node = next;
3466
cca55fe3 3467 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3468
cca55fe3 3469 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3470 else {
cca55fe3
JP
3471 if (PL_regkind[(U8)OP(text_node)] == REF) {
3472 I32 n, ln;
3473 n = ARG(text_node); /* which paren pair */
3474 ln = PL_regstartp[n];
3475 /* assume yes if we haven't seen CLOSEn */
3476 if (
eb160463 3477 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3478 ln == -1 ||
3479 ln == PL_regendp[n]
3480 ) {
3481 c1 = c2 = -1000;
3482 goto assume_ok_REG;
3483 }
3484 c1 = *(PL_bostr + ln);
3485 }
3486 else { c1 = (U8)*STRING(text_node); }
3487
af5decee 3488 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3489 c2 = PL_fold[c1];
af5decee 3490 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3491 c2 = PL_fold_locale[c1];
3492 else
3493 c2 = c1;
3494 }
a0ed51b3
LW
3495 }
3496 else
c277df42
IZ
3497 c1 = c2 = -1000;
3498 }
cca55fe3 3499 assume_ok_REG:
02db2b7b 3500 REGCP_SET(lastcp);
c277df42
IZ
3501 while (n >= ln) {
3502 /* If it could work, try it. */
3503 if (c1 == -1000 ||
3280af22
NIS
3504 UCHARAT(PL_reginput) == c1 ||
3505 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3506 {
3507 DEBUG_r(
c3464db5 3508 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3509 "%*s trying tail with n=%"IVdf"...\n",
3510 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3511 );
3512 if (paren) {
3513 if (n) {
cf93c79d
IZ
3514 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3515 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3516 }
a0ed51b3 3517 else
cf93c79d 3518 PL_regendp[paren] = -1;
c277df42 3519 }
a0ed51b3
LW
3520 if (regmatch(next))
3521 sayYES;
02db2b7b 3522 REGCP_UNWIND(lastcp);
a0ed51b3 3523 }
c277df42
IZ
3524 /* Couldn't or didn't -- back up. */
3525 n--;
dfe13c55 3526 locinput = HOPc(locinput, -l);
3280af22 3527 PL_reginput = locinput;
c277df42
IZ
3528 }
3529 }
3530 sayNO;
3531 break;
3532 }
3533 case CURLYN:
3534 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3535 if (paren > PL_regsize)
3536 PL_regsize = paren;
eb160463 3537 if (paren > (I32)*PL_reglastparen)
3280af22 3538 *PL_reglastparen = paren;
c277df42
IZ
3539 ln = ARG1(scan); /* min to match */
3540 n = ARG2(scan); /* max to match */
dc45a647 3541 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3542 goto repeat;
a0d0e21e 3543 case CURLY:
c277df42 3544 paren = 0;
a0d0e21e
LW
3545 ln = ARG1(scan); /* min to match */
3546 n = ARG2(scan); /* max to match */
dc45a647 3547 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3548 goto repeat;
3549 case STAR:
3550 ln = 0;
c277df42 3551 n = REG_INFTY;
a0d0e21e 3552 scan = NEXTOPER(scan);
c277df42 3553 paren = 0;
a0d0e21e
LW
3554 goto repeat;
3555 case PLUS:
c277df42
IZ
3556 ln = 1;
3557 n = REG_INFTY;
3558 scan = NEXTOPER(scan);
3559 paren = 0;
3560 repeat:
a0d0e21e
LW
3561 /*
3562 * Lookahead to avoid useless match attempts
3563 * when we know what character comes next.
3564 */
5f80c4cf
JP
3565
3566 /*
3567 * Used to only do .*x and .*?x, but now it allows
3568 * for )'s, ('s and (?{ ... })'s to be in the way
3569 * of the quantifier and the EXACT-like node. -- japhy
3570 */
3571
cca55fe3 3572 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3573 U8 *s;
3574 regnode *text_node = next;
3575
cca55fe3 3576 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3577
cca55fe3 3578 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3579 else {
cca55fe3
JP
3580 if (PL_regkind[(U8)OP(text_node)] == REF) {
3581 I32 n, ln;
3582 n = ARG(text_node); /* which paren pair */
3583 ln = PL_regstartp[n];
3584 /* assume yes if we haven't seen CLOSEn */
3585 if (
eb160463 3586 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3587 ln == -1 ||
3588 ln == PL_regendp[n]
3589 ) {
3590 c1 = c2 = -1000;
3591 goto assume_ok_easy;
3592 }
9246c65e 3593 s = (U8*)PL_bostr + ln;
cca55fe3
JP
3594 }
3595 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3596
3597 if (!UTF) {
3598 c2 = c1 = *s;
f65d3ee7 3599 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3600 c2 = PL_fold[c1];
f65d3ee7 3601 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3602 c2 = PL_fold_locale[c1];
1aa99e6b 3603 }
5f80c4cf 3604 else { /* UTF */
f65d3ee7 3605 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3606 STRLEN ulen1, ulen2;
e7ae6809
JH
3607 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3608 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3609
3610 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3611 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3612
872c91ae
JH
3613 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3614 ckWARN(WARN_UTF8) ?
3615 0 : UTF8_ALLOW_ANY);
3616 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3617 ckWARN(WARN_UTF8) ?
3618 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3619 }
3620 else {
872c91ae
JH
3621 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3622 ckWARN(WARN_UTF8) ?
3623 0 : UTF8_ALLOW_ANY);
5f80c4cf 3624 }
1aa99e6b
IH
3625 }
3626 }
bbce6d69 3627 }
a0d0e21e 3628 else
bbce6d69 3629 c1 = c2 = -1000;
cca55fe3 3630 assume_ok_easy:
3280af22 3631 PL_reginput = locinput;
a0d0e21e 3632 if (minmod) {
c277df42 3633 CHECKPOINT lastcp;
a0d0e21e
LW
3634 minmod = 0;
3635 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3636 sayNO;
a0ed51b3 3637 locinput = PL_reginput;
02db2b7b 3638 REGCP_SET(lastcp);
0fe9bf95 3639 if (c1 != -1000) {
1aa99e6b 3640 char *e; /* Should not check after this */
0fe9bf95 3641 char *old = locinput;
b2f2f093 3642 int count = 0;
0fe9bf95 3643
1aa99e6b 3644 if (n == REG_INFTY) {
0fe9bf95 3645 e = PL_regeol - 1;
1aa99e6b
IH
3646 if (do_utf8)
3647 while (UTF8_IS_CONTINUATION(*(U8*)e))
3648 e--;
3649 }
3650 else if (do_utf8) {
3651 int m = n - ln;
3652 for (e = locinput;
3653 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3654 e += UTF8SKIP(e);
3655 }
3656 else {
3657 e = locinput + n - ln;
3658 if (e >= PL_regeol)
3659 e = PL_regeol - 1;
3660 }
0fe9bf95
IZ
3661 while (1) {
3662 /* Find place 'next' could work */
1aa99e6b
IH
3663 if (!do_utf8) {
3664 if (c1 == c2) {
a8e8ab15
JH
3665 while (locinput <= e &&
3666 UCHARAT(locinput) != c1)
1aa99e6b
IH
3667 locinput++;
3668 } else {
9041c2e3 3669 while (locinput <= e
a8e8ab15
JH
3670 && UCHARAT(locinput) != c1
3671 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3672 locinput++;
3673 }
3674 count = locinput - old;
3675 }
3676 else {
3677 STRLEN len;
3678 if (c1 == c2) {
872c91ae
JH
3679 /* count initialised to
3680 * utf8_distance(old, locinput) */
b2f2f093 3681 while (locinput <= e &&
872c91ae
JH
3682 utf8n_to_uvchr((U8*)locinput,
3683 UTF8_MAXLEN, &len,
3684 ckWARN(WARN_UTF8) ?
eb160463 3685 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3686 locinput += len;
b2f2f093
JH
3687 count++;
3688 }
1aa99e6b 3689 } else {
872c91ae
JH
3690 /* count initialised to
3691 * utf8_distance(old, locinput) */
b2f2f093 3692 while (locinput <= e) {
872c91ae
JH
3693 UV c = utf8n_to_uvchr((U8*)locinput,
3694 UTF8_MAXLEN, &len,
3695 ckWARN(WARN_UTF8) ?
3696 0 : UTF8_ALLOW_ANY);
eb160463 3697 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3698 break;
b2f2f093
JH
3699 locinput += len;
3700 count++;
1aa99e6b
IH
3701 }
3702 }
0fe9bf95 3703 }
9041c2e3 3704 if (locinput > e)
0fe9bf95
IZ
3705 sayNO;
3706 /* PL_reginput == old now */
3707 if (locinput != old) {
3708 ln = 1; /* Did some */
1aa99e6b 3709 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3710 sayNO;
3711 }
3712 /* PL_reginput == locinput now */
29d1e993 3713 TRYPAREN(paren, ln, locinput);
0fe9bf95 3714 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3715 REGCP_UNWIND(lastcp);
0fe9bf95 3716 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3717 old = locinput;
3718 if (do_utf8)
3719 locinput += UTF8SKIP(locinput);
3720 else
3721 locinput++;
b2f2f093 3722 count = 1;
0fe9bf95
IZ
3723 }
3724 }
3725 else
c277df42 3726 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3727 UV c;
3728 if (c1 != -1000) {
3729 if (do_utf8)
872c91ae
JH
3730 c = utf8n_to_uvchr((U8*)PL_reginput,
3731 UTF8_MAXLEN, 0,
3732 ckWARN(WARN_UTF8) ?
3733 0 : UTF8_ALLOW_ANY);
1aa99e6b 3734 else
9041c2e3 3735 c = UCHARAT(PL_reginput);
2390ecbc 3736 /* If it could work, try it. */
eb160463 3737 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3738 {
ecc99935 3739 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3740 REGCP_UNWIND(lastcp);
3741 }
1aa99e6b 3742 }
a0d0e21e 3743 /* If it could work, try it. */
2390ecbc 3744 else if (c1 == -1000)
bbce6d69 3745 {
ecc99935 3746 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3747 REGCP_UNWIND(lastcp);
bbce6d69 3748 }
c277df42 3749 /* Couldn't or didn't -- move forward. */
a0ed51b3 3750 PL_reginput = locinput;
a0d0e21e
LW
3751 if (regrepeat(scan, 1)) {
3752 ln++;
a0ed51b3
LW
3753 locinput = PL_reginput;
3754 }
3755 else
4633a7c4 3756 sayNO;
a0d0e21e
LW
3757 }
3758 }
3759 else {
c277df42 3760 CHECKPOINT lastcp;
a0d0e21e 3761 n = regrepeat(scan, n);
a0ed51b3 3762 locinput = PL_reginput;
22c35a8c 3763 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3764 ((!PL_multiline && OP(next) != MEOL) ||
3765 OP(next) == SEOL || OP(next) == EOS))
3766 {
a0d0e21e 3767 ln = n; /* why back off? */
1aeab75a
GS
3768 /* ...because $ and \Z can match before *and* after
3769 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3770 We should back off by one in this case. */
3771 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3772 ln--;
3773 }
02db2b7b 3774 REGCP_SET(lastcp);
c277df42 3775 if (paren) {
8fa7f367 3776 UV c = 0;
c277df42 3777 while (n >= ln) {
1aa99e6b
IH
3778 if (c1 != -1000) {
3779 if (do_utf8)
872c91ae
JH
3780 c = utf8n_to_uvchr((U8*)PL_reginput,
3781 UTF8_MAXLEN, 0,
3782 ckWARN(WARN_UTF8) ?
3783 0 : UTF8_ALLOW_ANY);
1aa99e6b 3784 else
9041c2e3 3785 c = UCHARAT(PL_reginput);
1aa99e6b 3786 }
c277df42 3787 /* If it could work, try it. */
eb160463 3788 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3789 {
29d1e993 3790 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3791 REGCP_UNWIND(lastcp);
c277df42
IZ
3792 }
3793 /* Couldn't or didn't -- back up. */
3794 n--;
dfe13c55 3795 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3796 }
a0ed51b3
LW
3797 }
3798 else {
8fa7f367 3799 UV c = 0;
c277df42 3800 while (n >= ln) {
1aa99e6b
IH
3801 if (c1 != -1000) {
3802 if (do_utf8)
872c91ae
JH
3803 c = utf8n_to_uvchr((U8*)PL_reginput,
3804 UTF8_MAXLEN, 0,
3805 ckWARN(WARN_UTF8) ?
3806 0 : UTF8_ALLOW_ANY);
1aa99e6b 3807 else
9041c2e3 3808 c = UCHARAT(PL_reginput);
1aa99e6b 3809 }
c277df42 3810 /* If it could work, try it. */
eb160463 3811 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3812 {
29d1e993 3813 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3814 REGCP_UNWIND(lastcp);
c277df42
IZ
3815 }
3816 /* Couldn't or didn't -- back up. */
3817 n--;
dfe13c55 3818 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3819 }
a0d0e21e
LW
3820 }
3821 }
4633a7c4 3822 sayNO;
c277df42 3823 break;
a0d0e21e 3824 case END:
0f5d15d6
IZ
3825 if (PL_reg_call_cc) {
3826 re_cc_state *cur_call_cc = PL_reg_call_cc;
3827 CURCUR *cctmp = PL_regcc;
3828 regexp *re = PL_reg_re;
3829 CHECKPOINT cp, lastcp;
3830
3831 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3832 REGCP_SET(lastcp);
0f5d15d6
IZ
3833 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3834 the caller. */
3835 PL_reginput = locinput; /* Make position available to
3836 the callcc. */
3837 cache_re(PL_reg_call_cc->re);
3838 PL_regcc = PL_reg_call_cc->cc;
3839 PL_reg_call_cc = PL_reg_call_cc->prev;
3840 if (regmatch(cur_call_cc->node)) {
3841 PL_reg_call_cc = cur_call_cc;
3842 regcpblow(cp);
3843 sayYES;
3844 }
02db2b7b 3845 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3846 regcppop();
3847 PL_reg_call_cc = cur_call_cc;
3848 PL_regcc = cctmp;
3849 PL_reg_re = re;
3850 cache_re(re);
3851
3852 DEBUG_r(
3853 PerlIO_printf(Perl_debug_log,
3854 "%*s continuation failed...\n",
3855 REPORT_CODE_OFF+PL_regindent*2, "")
3856 );
7821416a 3857 sayNO_SILENT;
0f5d15d6 3858 }
7821416a
IZ
3859 if (locinput < PL_regtill) {
3860 DEBUG_r(PerlIO_printf(Perl_debug_log,
3861 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3862 PL_colors[4],
3863 (long)(locinput - PL_reg_starttry),
3864 (long)(PL_regtill - PL_reg_starttry),
3865 PL_colors[5]));
3866 sayNO_FINAL; /* Cannot match: too short. */
3867 }
3868 PL_reginput = locinput; /* put where regtry can find it */
3869 sayYES_FINAL; /* Success! */
7e5428c5 3870 case SUCCEED:
3280af22 3871 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3872 sayYES_LOUD; /* Success! */
c277df42
IZ
3873 case SUSPEND:
3874 n = 1;
9fe1d20c 3875 PL_reginput = locinput;
9041c2e3 3876 goto do_ifmatch;
a0d0e21e 3877 case UNLESSM:
c277df42 3878 n = 0;
a0ed51b3 3879 if (scan->flags) {
efb30f32
HS
3880 s = HOPBACKc(locinput, scan->flags);
3881 if (!s)
3882 goto say_yes;
3883 PL_reginput = s;
a0ed51b3
LW
3884 }
3885 else
3886 PL_reginput = locinput;
c277df42
IZ
3887 goto do_ifmatch;
3888 case IFMATCH:
3889 n = 1;
a0ed51b3 3890 if (scan->flags) {
efb30f32
HS
3891 s = HOPBACKc(locinput, scan->flags);
3892 if (!s)
3893 goto say_no;
3894 PL_reginput = s;
a0ed51b3
LW
3895 }
3896 else
3897 PL_reginput = locinput;
3898
c277df42 3899 do_ifmatch:
c277df42
IZ
3900 inner = NEXTOPER(NEXTOPER(scan));
3901 if (regmatch(inner) != n) {
3902 say_no:
3903 if (logical) {
3904 logical = 0;
3905 sw = 0;
3906 goto do_longjump;
a0ed51b3
LW
3907 }
3908 else
c277df42
IZ
3909 sayNO;
3910 }
3911 say_yes:
3912 if (logical) {
3913 logical = 0;
3914 sw = 1;
3915 }
fe44a5e8 3916 if (OP(scan) == SUSPEND) {
3280af22 3917 locinput = PL_reginput;
565764a8 3918 nextchr = UCHARAT(locinput);
fe44a5e8 3919 }
c277df42
IZ
3920 /* FALL THROUGH. */
3921 case LONGJMP:
3922 do_longjump:
3923 next = scan + ARG(scan);
3924 if (next == scan)
3925 next = NULL;
a0d0e21e
LW
3926 break;
3927 default:
b900a521 3928 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3929 PTR2UV(scan), OP(scan));
cea2e8a9 3930 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3931 }
02db2b7b 3932 reenter:
a0d0e21e
LW
3933 scan = next;
3934 }
a687059c 3935
a0d0e21e
LW
3936 /*
3937 * We get here only if there's trouble -- normally "case END" is
3938 * the terminating point.
3939 */
cea2e8a9 3940 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3941 /*NOTREACHED*/
4633a7c4
LW
3942 sayNO;
3943
7821416a
IZ
3944yes_loud:
3945 DEBUG_r(
3946 PerlIO_printf(Perl_debug_log,
3947 "%*s %scould match...%s\n",
3948 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3949 );
3950 goto yes;
3951yes_final:
3952 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3953 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3954yes:
3955#ifdef DEBUGGING
3280af22 3956 PL_regindent--;
4633a7c4 3957#endif
02db2b7b
IZ
3958
3959#if 0 /* Breaks $^R */
3960 if (unwind)
3961 regcpblow(firstcp);
3962#endif
4633a7c4
LW
3963 return 1;
3964
3965no:
7821416a
IZ
3966 DEBUG_r(
3967 PerlIO_printf(Perl_debug_log,
3968 "%*s %sfailed...%s\n",
3969 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3970 );
3971 goto do_no;
3972no_final:
3973do_no:
02db2b7b
IZ
3974 if (unwind) {
3975 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3976
3977 switch (uw->type) {
3978 case RE_UNWIND_BRANCH:
3979 case RE_UNWIND_BRANCHJ:
3980 {
3981 re_unwind_branch_t *uwb = &(uw->branch);
3982 I32 lastparen = uwb->lastparen;
9041c2e3 3983
02db2b7b
IZ
3984 REGCP_UNWIND(uwb->lastcp);
3985 for (n = *PL_reglastparen; n > lastparen; n--)
3986 PL_regendp[n] = -1;
3987 *PL_reglastparen = n;
3988 scan = next = uwb->next;
9041c2e3
NIS
3989 if ( !scan ||
3990 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3991 ? BRANCH : BRANCHJ) ) { /* Failure */
3992 unwind = uwb->prev;
3993#ifdef DEBUGGING
3994 PL_regindent--;
3995#endif
3996 goto do_no;
3997 }
3998 /* Have more choice yet. Reuse the same uwb. */
3999 /*SUPPRESS 560*/
4000 if ((n = (uwb->type == RE_UNWIND_BRANCH
4001 ? NEXT_OFF(next) : ARG(next))))
4002 next += n;
4003 else
4004 next = NULL; /* XXXX Needn't unwinding in this case... */
4005 uwb->next = next;
4006 next = NEXTOPER(scan);
4007 if (uwb->type == RE_UNWIND_BRANCHJ)
4008 next = NEXTOPER(next);
4009 locinput = uwb->locinput;
4010 nextchr = uwb->nextchr;
4011#ifdef DEBUGGING
4012 PL_regindent = uwb->regindent;
4013#endif
4014
4015 goto reenter;
4016 }
4017 /* NOT REACHED */
4018 default:
4019 Perl_croak(aTHX_ "regexp unwind memory corruption");
4020 }
4021 /* NOT REACHED */
4022 }
4633a7c4 4023#ifdef DEBUGGING
3280af22 4024 PL_regindent--;
4633a7c4 4025#endif
a0d0e21e 4026 return 0;
a687059c
LW
4027}
4028
4029/*
4030 - regrepeat - repeatedly match something simple, report how many
4031 */
4032/*
4033 * [This routine now assumes that it will only match on things of length 1.
4034 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4035 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4036 */
76e3520e 4037STATIC I32
cea2e8a9 4038S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 4039{
a0d0e21e 4040 register char *scan;
a0d0e21e 4041 register I32 c;
3280af22 4042 register char *loceol = PL_regeol;
a0ed51b3 4043 register I32 hardcount = 0;
53c4c00c 4044 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4045
3280af22 4046 scan = PL_reginput;
faf11cac
HS
4047 if (max == REG_INFTY)
4048 max = I32_MAX;
4049 else if (max < loceol - scan)
a0d0e21e 4050 loceol = scan + max;
a0d0e21e 4051 switch (OP(p)) {
22c35a8c 4052 case REG_ANY:
1aa99e6b 4053 if (do_utf8) {
ffc61ed2 4054 loceol = PL_regeol;
1aa99e6b 4055 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4056 scan += UTF8SKIP(scan);
4057 hardcount++;
4058 }
4059 } else {
4060 while (scan < loceol && *scan != '\n')
4061 scan++;
a0ed51b3
LW
4062 }
4063 break;
ffc61ed2 4064 case SANY:
def8e4ea
JH
4065 if (do_utf8) {
4066 loceol = PL_regeol;
a0804c9e 4067 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4068 scan += UTF8SKIP(scan);
4069 hardcount++;
4070 }
4071 }
4072 else
4073 scan = loceol;
a0ed51b3 4074 break;
f33976b4
DB
4075 case CANY:
4076 scan = loceol;
4077 break;
bbce6d69 4078 case EXACT: /* length of string is 1 */
cd439c50 4079 c = (U8)*STRING(p);
bbce6d69 4080 while (scan < loceol && UCHARAT(scan) == c)
4081 scan++;
4082 break;
4083 case EXACTF: /* length of string is 1 */
cd439c50 4084 c = (U8)*STRING(p);
bbce6d69 4085 while (scan < loceol &&
22c35a8c 4086 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4087 scan++;
4088 break;
4089 case EXACTFL: /* length of string is 1 */
3280af22 4090 PL_reg_flags |= RF_tainted;
cd439c50 4091 c = (U8)*STRING(p);
bbce6d69 4092 while (scan < loceol &&
22c35a8c 4093 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4094 scan++;
4095 break;
4096 case ANYOF:
ffc61ed2
JH
4097 if (do_utf8) {
4098 loceol = PL_regeol;
cfc92286
JH
4099 while (hardcount < max && scan < loceol &&
4100 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4101 scan += UTF8SKIP(scan);
4102 hardcount++;
4103 }
4104 } else {
7d3e948e 4105 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4106 scan++;
4107 }
a0d0e21e
LW
4108 break;
4109 case ALNUM:
1aa99e6b 4110 if (do_utf8) {
ffc61ed2 4111 loceol = PL_regeol;
8269fa76 4112 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4113 while (hardcount < max && scan < loceol &&
3568d838 4114 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4115 scan += UTF8SKIP(scan);
4116 hardcount++;
4117 }
4118 } else {
4119 while (scan < loceol && isALNUM(*scan))
4120 scan++;
a0ed51b3
LW
4121 }
4122 break;
bbce6d69 4123 case ALNUML:
3280af22 4124 PL_reg_flags |= RF_tainted;
1aa99e6b 4125 if (do_utf8) {
ffc61ed2 4126 loceol = PL_regeol;
1aa99e6b
IH
4127 while (hardcount < max && scan < loceol &&
4128 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4129 scan += UTF8SKIP(scan);
4130 hardcount++;
4131 }
4132 } else {
4133 while (scan < loceol && isALNUM_LC(*scan))
4134 scan++;
a0ed51b3
LW
4135 }
4136 break;
a0d0e21e 4137 case NALNUM:
1aa99e6b 4138 if (do_utf8) {
ffc61ed2 4139 loceol = PL_regeol;
8269fa76 4140 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4141 while (hardcount < max && scan < loceol &&
3568d838 4142 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4143 scan += UTF8SKIP(scan);
4144 hardcount++;
4145 }
4146 } else {
4147 while (scan < loceol && !isALNUM(*scan))
4148 scan++;
a0ed51b3
LW
4149 }
4150 break;
bbce6d69 4151 case NALNUML:
3280af22 4152 PL_reg_flags |= RF_tainted;
1aa99e6b 4153 if (do_utf8) {
ffc61ed2 4154 loceol = PL_regeol;
1aa99e6b
IH
4155 while (hardcount < max && scan < loceol &&
4156 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4157 scan += UTF8SKIP(scan);
4158 hardcount++;
4159 }
4160 } else {
4161 while (scan < loceol && !isALNUM_LC(*scan))
4162 scan++;
a0ed51b3
LW
4163 }
4164 break;
a0d0e21e 4165 case SPACE:
1aa99e6b 4166 if (do_utf8) {
ffc61ed2 4167 loceol = PL_regeol;
8269fa76 4168 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4169 while (hardcount < max && scan < loceol &&
3568d838
JH
4170 (*scan == ' ' ||
4171 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4172 scan += UTF8SKIP(scan);
4173 hardcount++;
4174 }
4175 } else {
4176 while (scan < loceol && isSPACE(*scan))
4177 scan++;
a0ed51b3
LW
4178 }
4179 break;
bbce6d69 4180 case SPACEL:
3280af22 4181 PL_reg_flags |= RF_tainted;
1aa99e6b 4182 if (do_utf8) {
ffc61ed2 4183 loceol = PL_regeol;
1aa99e6b 4184 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4185 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4186 scan += UTF8SKIP(scan);
4187 hardcount++;
4188 }
4189 } else {
4190 while (scan < loceol && isSPACE_LC(*scan))
4191 scan++;
a0ed51b3
LW
4192 }
4193 break;
a0d0e21e 4194 case NSPACE:
1aa99e6b 4195 if (do_utf8) {
ffc61ed2 4196 loceol = PL_regeol;
8269fa76 4197 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4198 while (hardcount < max && scan < loceol &&
3568d838
JH
4199 !(*scan == ' ' ||
4200 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4201 scan += UTF8SKIP(scan);
4202 hardcount++;
4203 }
4204 } else {
4205 while (scan < loceol && !isSPACE(*scan))
4206 scan++;
4207 break;
a0ed51b3 4208 }
bbce6d69 4209 case NSPACEL:
3280af22 4210 PL_reg_flags |= RF_tainted;
1aa99e6b 4211 if (do_utf8) {
ffc61ed2 4212 loceol = PL_regeol;
1aa99e6b 4213 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4214 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4215 scan += UTF8SKIP(scan);
4216 hardcount++;
4217 }
4218 } else {
4219 while (scan < loceol && !isSPACE_LC(*scan))
4220 scan++;
a0ed51b3
LW
4221 }
4222 break;
a0d0e21e 4223 case DIGIT:
1aa99e6b 4224 if (do_utf8) {
ffc61ed2 4225 loceol = PL_regeol;
8269fa76 4226 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4227 while (hardcount < max && scan < loceol &&
3568d838 4228 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4229 scan += UTF8SKIP(scan);
4230 hardcount++;
4231 }
4232 } else {
4233 while (scan < loceol && isDIGIT(*scan))
4234 scan++;
a0ed51b3
LW
4235 }
4236 break;
a0d0e21e 4237 case NDIGIT:
1aa99e6b 4238 if (do_utf8) {
ffc61ed2 4239 loceol = PL_regeol;
8269fa76 4240 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4241 while (hardcount < max && scan < loceol &&
3568d838 4242 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4243 scan += UTF8SKIP(scan);
4244 hardcount++;
4245 }
4246 } else {
4247 while (scan < loceol && !isDIGIT(*scan))
4248 scan++;
a0ed51b3
LW
4249 }
4250 break;
a0d0e21e
LW
4251 default: /* Called on something of 0 width. */
4252 break; /* So match right here or not at all. */
4253 }
a687059c 4254
a0ed51b3
LW
4255 if (hardcount)
4256 c = hardcount;
4257 else
4258 c = scan - PL_reginput;
3280af22 4259 PL_reginput = scan;
a687059c 4260
9041c2e3 4261 DEBUG_r(
c277df42
IZ
4262 {
4263 SV *prop = sv_newmortal();
4264
4265 regprop(prop, p);
9041c2e3
NIS
4266 PerlIO_printf(Perl_debug_log,
4267 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4268 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4269 });
9041c2e3 4270
a0d0e21e 4271 return(c);
a687059c
LW
4272}
4273
4274/*
c277df42 4275 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4276 *
c277df42
IZ
4277 * The repeater is supposed to have constant length.
4278 */
4279
76e3520e 4280STATIC I32
cea2e8a9 4281S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4282{
b7953727 4283 register char *scan = Nullch;
c277df42 4284 register char *start;
3280af22 4285 register char *loceol = PL_regeol;
a0ed51b3 4286 I32 l = 0;
708e3b05 4287 I32 count = 0, res = 1;
a0ed51b3
LW
4288
4289 if (!max)
4290 return 0;
c277df42 4291
3280af22 4292 start = PL_reginput;
53c4c00c 4293 if (PL_reg_match_utf8) {
708e3b05 4294 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4295 if (!count++) {
4296 l = 0;
4297 while (start < PL_reginput) {
4298 l++;
4299 start += UTF8SKIP(start);
4300 }
4301 *lp = l;
4302 if (l == 0)
4303 return max;
4304 }
4305 if (count == max)
4306 return count;
4307 }
4308 }
4309 else {
708e3b05 4310 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4311 if (!count++) {
4312 *lp = l = PL_reginput - start;
4313 if (max != REG_INFTY && l*max < loceol - scan)
4314 loceol = scan + l*max;
4315 if (l == 0)
4316 return max;
c277df42
IZ
4317 }
4318 }
4319 }
708e3b05 4320 if (!res)
3280af22 4321 PL_reginput = scan;
9041c2e3 4322
a0ed51b3 4323 return count;
c277df42
IZ
4324}
4325
4326/*
ffc61ed2
JH
4327- regclass_swash - prepare the utf8 swash
4328*/
4329
4330SV *
9e55ce06 4331Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4332{
9e55ce06
JH
4333 SV *sw = NULL;
4334 SV *si = NULL;
4335 SV *alt = NULL;
ffc61ed2
JH
4336
4337 if (PL_regdata && PL_regdata->count) {
4338 U32 n = ARG(node);
4339
4340 if (PL_regdata->what[n] == 's') {
4341 SV *rv = (SV*)PL_regdata->data[n];
4342 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4343 SV **ary = AvARRAY(av);
9e55ce06 4344 SV **a, **b;
9041c2e3 4345
9e55ce06
JH
4346 /* See the end of regcomp.c:S_reglass() for
4347 * documentation of these array elements. */
4348
b11f357e
JH
4349 si = *ary;
4350 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4351 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4352
ffc61ed2
JH
4353 if (a)
4354 sw = *a;
4355 else if (si && doinit) {
4356 sw = swash_init("utf8", "", si, 1, 0);
4357 (void)av_store(av, 1, sw);
4358 }
9e55ce06
JH
4359 if (b)
4360 alt = *b;
ffc61ed2
JH
4361 }
4362 }
4363
9e55ce06
JH
4364 if (listsvp)
4365 *listsvp = si;
4366 if (altsvp)
4367 *altsvp = alt;
ffc61ed2
JH
4368
4369 return sw;
4370}
4371
4372/*
ba7b4546 4373 - reginclass - determine if a character falls into a character class
832705d4
JH
4374
4375 The n is the ANYOF regnode, the p is the target string, lenp
4376 is pointer to the maximum length of how far to go in the p
4377 (if the lenp is zero, UTF8SKIP(p) is used),
4378 do_utf8 tells whether the target string is in UTF-8.
4379
bbce6d69 4380 */
4381
76e3520e 4382STATIC bool
ba7b4546 4383S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4384{
ffc61ed2 4385 char flags = ANYOF_FLAGS(n);
bbce6d69 4386 bool match = FALSE;
cc07378b 4387 UV c = *p;
ae9ddab8 4388 STRLEN len = 0;
9e55ce06 4389 STRLEN plen;
1aa99e6b 4390
ae9ddab8
JH
4391 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4392 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4393 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4394
0f0076b4 4395 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4396 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4397 if (lenp)
4398 *lenp = 0;
ffc61ed2 4399 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4400 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4401 match = TRUE;
bbce6d69 4402 }
3568d838 4403 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4404 match = TRUE;
ffc61ed2 4405 if (!match) {
9e55ce06
JH
4406 AV *av;
4407 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4408
4409 if (sw) {
3568d838 4410 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4411 match = TRUE;
4412 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4413 if (!match && lenp && av) {
4414 I32 i;
4415
4416 for (i = 0; i <= av_len(av); i++) {
4417 SV* sv = *av_fetch(av, i, FALSE);
4418 STRLEN len;
4419 char *s = SvPV(sv, len);
4420
061b10df 4421 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4422 *lenp = len;
4423 match = TRUE;
4424 break;
4425 }
4426 }
4427 }
4428 if (!match) {
4a623e43
JH
4429 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4430 STRLEN tmplen;
4431
9e55ce06
JH
4432 to_utf8_fold(p, tmpbuf, &tmplen);
4433 if (swash_fetch(sw, tmpbuf, do_utf8))
4434 match = TRUE;
4435 }
ffc61ed2
JH
4436 }
4437 }
bbce6d69 4438 }
9e55ce06 4439 if (match && lenp && *lenp == 0)
0f0076b4 4440 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4441 }
1aa99e6b 4442 if (!match && c < 256) {
ffc61ed2
JH
4443 if (ANYOF_BITMAP_TEST(n, c))
4444 match = TRUE;
4445 else if (flags & ANYOF_FOLD) {
eb160463 4446 U8 f;
a0ed51b3 4447
ffc61ed2
JH
4448 if (flags & ANYOF_LOCALE) {
4449 PL_reg_flags |= RF_tainted;
4450 f = PL_fold_locale[c];
4451 }
4452 else
4453 f = PL_fold[c];
4454 if (f != c && ANYOF_BITMAP_TEST(n, f))
4455 match = TRUE;
4456 }
4457
4458 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4459 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4460 if (
4461 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4462 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4463 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4472 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4473 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4474 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4475 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4476 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4477 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4478 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4479 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4480 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4481 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4482 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4483 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4484 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4485 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4486 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4487 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4488 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4489 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4490 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4491 ) /* How's that for a conditional? */
4492 {
4493 match = TRUE;
4494 }
a0ed51b3 4495 }
a0ed51b3
LW
4496 }
4497
a0ed51b3
LW
4498 return (flags & ANYOF_INVERT) ? !match : match;
4499}
161b471a 4500
dfe13c55 4501STATIC U8 *
cea2e8a9 4502S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4503{
1aa99e6b
IH
4504 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4505}
4506
4507STATIC U8 *
4508S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4509{
a0ed51b3 4510 if (off >= 0) {
1aa99e6b 4511 while (off-- && s < lim) {
ffc61ed2 4512 /* XXX could check well-formedness here */
a0ed51b3 4513 s += UTF8SKIP(s);
ffc61ed2 4514 }
a0ed51b3
LW
4515 }
4516 else {
4517 while (off++) {
1aa99e6b 4518 if (s > lim) {
a0ed51b3 4519 s--;
ffc61ed2 4520 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4521 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4522 s--;
ffc61ed2
JH
4523 }
4524 /* XXX could check well-formedness here */
a0ed51b3
LW
4525 }
4526 }
4527 }
4528 return s;
4529}
161b471a 4530
dfe13c55 4531STATIC U8 *
1aa99e6b 4532S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4533{
1aa99e6b
IH
4534 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4535}
4536
4537STATIC U8 *
4538S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4539{
4540 if (off >= 0) {
1aa99e6b 4541 while (off-- && s < lim) {
ffc61ed2 4542 /* XXX could check well-formedness here */
a0ed51b3 4543 s += UTF8SKIP(s);
ffc61ed2 4544 }
a0ed51b3
LW
4545 if (off >= 0)
4546 return 0;
4547 }
4548 else {
4549 while (off++) {
1aa99e6b 4550 if (s > lim) {
a0ed51b3 4551 s--;
ffc61ed2 4552 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4553 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4554 s--;
ffc61ed2
JH
4555 }
4556 /* XXX could check well-formedness here */
a0ed51b3
LW
4557 }
4558 else
4559 break;
4560 }
4561 if (off <= 0)
4562 return 0;
4563 }
4564 return s;
4565}
51371543 4566
51371543 4567static void
acfe0abc 4568restore_pos(pTHX_ void *arg)
51371543 4569{
51371543
GS
4570 if (PL_reg_eval_set) {
4571 if (PL_reg_oldsaved) {
4572 PL_reg_re->subbeg = PL_reg_oldsaved;
4573 PL_reg_re->sublen = PL_reg_oldsavedlen;
ed252734
NC
4574#ifdef PERL_COPY_ON_WRITE
4575 PL_reg_re->saved_copy = PL_nrs;
4576#endif
51371543
GS
4577 RX_MATCH_COPIED_on(PL_reg_re);
4578 }
4579 PL_reg_magic->mg_len = PL_reg_oldpos;
4580 PL_reg_eval_set = 0;
4581 PL_curpm = PL_reg_oldcurpm;
4582 }
4583}
33b8afdf
JH
4584
4585STATIC void
4586S_to_utf8_substr(pTHX_ register regexp *prog)
4587{
4588 SV* sv;
4589 if (prog->float_substr && !prog->float_utf8) {
4590 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4591 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4592 sv_utf8_upgrade(sv);
4593 if (SvTAIL(prog->float_substr))
4594 SvTAIL_on(sv);
4595 if (prog->float_substr == prog->check_substr)
4596 prog->check_utf8 = sv;
4597 }
4598 if (prog->anchored_substr && !prog->anchored_utf8) {
4599 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4600 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4601 sv_utf8_upgrade(sv);
4602 if (SvTAIL(prog->anchored_substr))
4603 SvTAIL_on(sv);
4604 if (prog->anchored_substr == prog->check_substr)
4605 prog->check_utf8 = sv;
4606 }
4607}
4608
4609STATIC void
4610S_to_byte_substr(pTHX_ register regexp *prog)
4611{
4612 SV* sv;
4613 if (prog->float_utf8 && !prog->float_substr) {
4614 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4615 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4616 if (sv_utf8_downgrade(sv, TRUE)) {
4617 if (SvTAIL(prog->float_utf8))
4618 SvTAIL_on(sv);
4619 } else {
4620 SvREFCNT_dec(sv);
4621 prog->float_substr = sv = &PL_sv_undef;
4622 }
4623 if (prog->float_utf8 == prog->check_utf8)
4624 prog->check_substr = sv;
4625 }
4626 if (prog->anchored_utf8 && !prog->anchored_substr) {
4627 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4628 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4629 if (sv_utf8_downgrade(sv, TRUE)) {
4630 if (SvTAIL(prog->anchored_utf8))
4631 SvTAIL_on(sv);
4632 } else {
4633 SvREFCNT_dec(sv);
4634 prog->anchored_substr = sv = &PL_sv_undef;
4635 }
4636 if (prog->anchored_utf8 == prog->check_utf8)
4637 prog->check_substr = sv;
4638 }
4639}