This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [perl #21765] $s = ""; $n = @a = split(/,/, $s); results in undef $n
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
4c79ee7a 70 **** Copyright (c) 1991-2003, Larry Wall
a687059c 71 ****
9ef589d8
LW
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
a687059c
LW
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
864dbfa3 80#define PERL_IN_REGEXEC_C
a687059c 81#include "perl.h"
0f5d15d6 82
a687059c
LW
83#include "regcomp.h"
84
c277df42
IZ
85#define RF_tainted 1 /* tainted information used? */
86#define RF_warned 2 /* warned about big count? */
ce862d02 87#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
88#define RF_utf8 8 /* String contains multibyte chars? */
89
eb160463 90#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
91
92#define RS_init 1 /* eval environment created */
93#define RS_set 2 /* replsv value is set */
c277df42 94
a687059c
LW
95#ifndef STATIC
96#define STATIC static
97#endif
98
ba7b4546 99#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 100
c277df42
IZ
101/*
102 * Forwards.
103 */
104
33b8afdf 105#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 106#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 107
dfe13c55
GS
108#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
109#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
110#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
111#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
112#define HOPc(pos,off) ((char*)HOP(pos,off))
113#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 114
efb30f32 115#define HOPBACK(pos, off) ( \
e54858b0 116 (PL_reg_match_utf8) \
efb30f32
HS
117 ? reghopmaybe((U8*)pos, -off) \
118 : (pos - off >= PL_bostr) \
119 ? (U8*)(pos - off) \
120 : (U8*)NULL \
121)
122#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
123
1aa99e6b
IH
124#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
125#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
127#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
129#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
130
977e4b84 131#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 132
5f80c4cf 133/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
134#define JUMPABLE(rn) ( \
135 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
136 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
137 OP(rn) == PLUS || OP(rn) == MINMOD || \
138 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
139)
140
cca55fe3
JP
141#define HAS_TEXT(rn) ( \
142 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
143)
e2d8ce26 144
a84d97b6
HS
145/*
146 Search for mandatory following text node; for lookahead, the text must
147 follow but for lookbehind (rn->flags != 0) we skip to the next step.
148*/
cca55fe3 149#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 150 while (JUMPABLE(rn)) \
a84d97b6 151 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 152 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
153 else if (OP(rn) == PLUS) \
154 rn = NEXTOPER(rn); \
a84d97b6
HS
155 else if (OP(rn) == IFMATCH) \
156 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 157 else rn += NEXT_OFF(rn); \
5f80c4cf 158} STMT_END
74750237 159
acfe0abc 160static void restore_pos(pTHX_ void *arg);
51371543 161
76e3520e 162STATIC CHECKPOINT
cea2e8a9 163S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 164{
3280af22 165 int retval = PL_savestack_ix;
b1ce53c5
JH
166#define REGCP_PAREN_ELEMS 4
167 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
168 int p;
169
e49a9654
IH
170 if (paren_elems_to_push < 0)
171 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
172
a01268b5 173#define REGCP_OTHER_ELEMS 6
b1ce53c5 174 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 175 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 176/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
177 SSPUSHINT(PL_regendp[p]);
178 SSPUSHINT(PL_regstartp[p]);
3280af22 179 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
180 SSPUSHINT(p);
181 }
b1ce53c5 182/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
183 SSPUSHINT(PL_regsize);
184 SSPUSHINT(*PL_reglastparen);
a01268b5 185 SSPUSHINT(*PL_reglastcloseparen);
3280af22 186 SSPUSHPTR(PL_reginput);
41123dfd
JH
187#define REGCP_FRAME_ELEMS 2
188/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
189 * are needed for the regexp context stack bookkeeping. */
190 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 191 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 192
a0d0e21e
LW
193 return retval;
194}
195
c277df42 196/* These are needed since we do not localize EVAL nodes: */
02db2b7b 197# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 199 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 200
02db2b7b 201# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 202 PerlIO_printf(Perl_debug_log, \
faccc32b 203 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 204 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 205
76e3520e 206STATIC char *
cea2e8a9 207S_regcppop(pTHX)
a0d0e21e 208{
b1ce53c5 209 I32 i;
a0d0e21e
LW
210 U32 paren = 0;
211 char *input;
cf93c79d 212 I32 tmps;
b1ce53c5
JH
213
214 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 215 i = SSPOPINT;
b1ce53c5
JH
216 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
217 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 218 input = (char *) SSPOPPTR;
a01268b5 219 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
220 *PL_reglastparen = SSPOPINT;
221 PL_regsize = SSPOPINT;
b1ce53c5
JH
222
223 /* Now restore the parentheses context. */
41123dfd
JH
224 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
225 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 226 paren = (U32)SSPOPINT;
3280af22 227 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
228 PL_regstartp[paren] = SSPOPINT;
229 tmps = SSPOPINT;
3280af22
NIS
230 if (paren <= *PL_reglastparen)
231 PL_regendp[paren] = tmps;
c277df42 232 DEBUG_r(
c3464db5 233 PerlIO_printf(Perl_debug_log,
b900a521 234 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 235 (UV)paren, (IV)PL_regstartp[paren],
b900a521 236 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 237 (IV)PL_regendp[paren],
3280af22 238 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 239 );
a0d0e21e 240 }
c277df42 241 DEBUG_r(
eb160463 242 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 243 PerlIO_printf(Perl_debug_log,
faccc32b
JH
244 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
245 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
246 }
247 );
daf18116 248#if 1
dafc8851
JH
249 /* It would seem that the similar code in regtry()
250 * already takes care of this, and in fact it is in
251 * a better location to since this code can #if 0-ed out
252 * but the code in regtry() is needed or otherwise tests
253 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
254 * (as of patchlevel 7877) will fail. Then again,
255 * this code seems to be necessary or otherwise
256 * building DynaLoader will fail:
257 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
258 * --jhi */
eb160463
GS
259 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
260 if ((I32)paren > PL_regsize)
cf93c79d
IZ
261 PL_regstartp[paren] = -1;
262 PL_regendp[paren] = -1;
a0d0e21e 263 }
dafc8851 264#endif
a0d0e21e
LW
265 return input;
266}
267
0f5d15d6 268STATIC char *
cea2e8a9 269S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
270{
271 I32 tmp = PL_savestack_ix;
272
273 PL_savestack_ix = ss;
274 regcppop();
275 PL_savestack_ix = tmp;
942e002e 276 return Nullch;
0f5d15d6
IZ
277}
278
279typedef struct re_cc_state
280{
281 I32 ss;
282 regnode *node;
283 struct re_cc_state *prev;
284 CURCUR *cc;
285 regexp *re;
286} re_cc_state;
287
02db2b7b 288#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 289
29d1e993
HS
290#define TRYPAREN(paren, n, input) { \
291 if (paren) { \
292 if (n) { \
293 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
294 PL_regendp[paren] = input - PL_bostr; \
295 } \
296 else \
297 PL_regendp[paren] = -1; \
298 } \
299 if (regmatch(next)) \
300 sayYES; \
301 if (paren && n) \
302 PL_regendp[paren] = -1; \
303}
304
305
a687059c 306/*
e50aee73 307 * pregexec and friends
a687059c
LW
308 */
309
310/*
c277df42 311 - pregexec - match a regexp against a string
a687059c 312 */
c277df42 313I32
864dbfa3 314Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 315 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
316/* strend: pointer to null at end of string */
317/* strbeg: real beginning of string */
318/* minend: end of match must be >=minend after stringarg. */
319/* nosave: For optimizations. */
320{
321 return
9041c2e3 322 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
323 nosave ? 0 : REXEC_COPY_STR);
324}
0f5d15d6
IZ
325
326STATIC void
cea2e8a9 327S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
328{
329 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
330#ifdef DEBUGGING
331 PL_regprogram = prog->program;
332#endif
333 PL_regnpar = prog->nparens;
9041c2e3
NIS
334 PL_regdata = prog->data;
335 PL_reg_re = prog;
0f5d15d6 336}
22e551b9 337
9041c2e3 338/*
cad2e5aa
JH
339 * Need to implement the following flags for reg_anch:
340 *
341 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
342 * USE_INTUIT_ML
343 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
344 * INTUIT_AUTORITATIVE_ML
345 * INTUIT_ONCE_NOML - Intuit can match in one location only.
346 * INTUIT_ONCE_ML
347 *
348 * Another flag for this function: SECOND_TIME (so that float substrs
349 * with giant delta may be not rechecked).
350 */
351
352/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
353
2c2d71f5 354/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
355 Otherwise, only SvCUR(sv) is used to get strbeg. */
356
357/* XXXX We assume that strpos is strbeg unless sv. */
358
6eb5f6b9
JH
359/* XXXX Some places assume that there is a fixed substring.
360 An update may be needed if optimizer marks as "INTUITable"
361 RExen without fixed substrings. Similarly, it is assumed that
362 lengths of all the strings are no more than minlen, thus they
363 cannot come from lookahead.
364 (Or minlen should take into account lookahead.) */
365
2c2d71f5
JH
366/* A failure to find a constant substring means that there is no need to make
367 an expensive call to REx engine, thus we celebrate a failure. Similarly,
368 finding a substring too deep into the string means that less calls to
30944b6d
IZ
369 regtry() should be needed.
370
371 REx compiler's optimizer found 4 possible hints:
372 a) Anchored substring;
373 b) Fixed substring;
374 c) Whether we are anchored (beginning-of-line or \G);
375 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 376 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
377 string which does not contradict any of them.
378 */
2c2d71f5 379
6eb5f6b9
JH
380/* Most of decisions we do here should have been done at compile time.
381 The nodes of the REx which we used for the search should have been
382 deleted from the finite automaton. */
383
cad2e5aa
JH
384char *
385Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
386 char *strend, U32 flags, re_scream_pos_data *data)
387{
b7953727 388 register I32 start_shift = 0;
cad2e5aa 389 /* Should be nonnegative! */
b7953727 390 register I32 end_shift = 0;
2c2d71f5
JH
391 register char *s;
392 register SV *check;
a1933d95 393 char *strbeg;
cad2e5aa 394 char *t;
33b8afdf 395 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 396 I32 ml_anch;
6eb5f6b9 397 register char *other_last = Nullch; /* other substr checked before this */
b7953727 398 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
399#ifdef DEBUGGING
400 char *i_strpos = strpos;
ce333219 401 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 402#endif
a30b2f1f 403 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 404
b8d68ded
JH
405 if (prog->reganch & ROPT_UTF8) {
406 DEBUG_r(PerlIO_printf(Perl_debug_log,
407 "UTF-8 regex...\n"));
408 PL_reg_flags |= RF_utf8;
409 }
410
2a782b5b 411 DEBUG_r({
b8d68ded 412 char *s = PL_reg_match_utf8 ?
c728cb41
JH
413 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
414 strpos;
b8d68ded
JH
415 int len = PL_reg_match_utf8 ?
416 strlen(s) : strend - strpos;
2a782b5b
JH
417 if (!PL_colorset)
418 reginitcolors();
b8d68ded
JH
419 if (PL_reg_match_utf8)
420 DEBUG_r(PerlIO_printf(Perl_debug_log,
421 "UTF-8 target...\n"));
2a782b5b
JH
422 PerlIO_printf(Perl_debug_log,
423 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
424 PL_colors[4],PL_colors[5],PL_colors[0],
425 prog->precomp,
426 PL_colors[1],
427 (strlen(prog->precomp) > 60 ? "..." : ""),
428 PL_colors[0],
429 (int)(len > 60 ? 60 : len),
430 s, PL_colors[1],
431 (len > 60 ? "..." : "")
432 );
433 });
cad2e5aa 434
c344f387
JH
435 /* CHR_DIST() would be more correct here but it makes things slow. */
436 if (prog->minlen > strend - strpos) {
a72c7584
JH
437 DEBUG_r(PerlIO_printf(Perl_debug_log,
438 "String too short... [re_intuit_start]\n"));
cad2e5aa 439 goto fail;
2c2d71f5 440 }
a1933d95 441 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 442 PL_regeol = strend;
33b8afdf
JH
443 if (do_utf8) {
444 if (!prog->check_utf8 && prog->check_substr)
445 to_utf8_substr(prog);
446 check = prog->check_utf8;
447 } else {
448 if (!prog->check_substr && prog->check_utf8)
449 to_byte_substr(prog);
450 check = prog->check_substr;
451 }
452 if (check == &PL_sv_undef) {
453 DEBUG_r(PerlIO_printf(Perl_debug_log,
454 "Non-utf string cannot match utf check string\n"));
455 goto fail;
456 }
2c2d71f5 457 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
458 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
459 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 460 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 461
7e25d62c
JH
462 if (!ml_anch) {
463 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
464 | ROPT_IMPLICIT)) /* not a real BOL */
465 /* SvCUR is not set on references: SvRV and SvPVX overlap */
466 && sv && !SvROK(sv)
467 && (strpos != strbeg)) {
468 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
469 goto fail;
470 }
471 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 472 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 473 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
474 I32 slen;
475
1aa99e6b 476 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
477 if (SvTAIL(check)) {
478 slen = SvCUR(check); /* >= 1 */
cad2e5aa 479
9041c2e3 480 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
481 || (strend - s == slen && strend[-1] != '\n')) {
482 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
483 goto fail_finish;
cad2e5aa
JH
484 }
485 /* Now should match s[0..slen-2] */
486 slen--;
653099ff 487 if (slen && (*SvPVX(check) != *s
cad2e5aa 488 || (slen > 1
653099ff 489 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
490 report_neq:
491 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
492 goto fail_finish;
493 }
cad2e5aa 494 }
653099ff
GS
495 else if (*SvPVX(check) != *s
496 || ((slen = SvCUR(check)) > 1
497 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
498 goto report_neq;
499 goto success_at_start;
7e25d62c 500 }
cad2e5aa 501 }
2c2d71f5 502 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 503 s = strpos;
2c2d71f5 504 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 505 end_shift = prog->minlen - start_shift -
653099ff 506 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 507 if (!ml_anch) {
653099ff
GS
508 I32 end = prog->check_offset_max + CHR_SVLEN(check)
509 - (SvTAIL(check) != 0);
1aa99e6b 510 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
511
512 if (end_shift < eshift)
513 end_shift = eshift;
514 }
cad2e5aa 515 }
2c2d71f5 516 else { /* Can match at random position */
cad2e5aa
JH
517 ml_anch = 0;
518 s = strpos;
2c2d71f5
JH
519 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
520 /* Should be nonnegative! */
521 end_shift = prog->minlen - start_shift -
653099ff 522 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
523 }
524
2c2d71f5 525#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 526 if (end_shift < 0)
6bbae5e6 527 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
528#endif
529
2c2d71f5
JH
530 restart:
531 /* Find a possible match in the region s..strend by looking for
532 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 533 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
534 I32 p = -1; /* Internal iterator of scream. */
535 I32 *pp = data ? data->scream_pos : &p;
536
2c2d71f5
JH
537 if (PL_screamfirst[BmRARE(check)] >= 0
538 || ( BmRARE(check) == '\n'
539 && (BmPREVIOUS(check) == SvCUR(check) - 1)
540 && SvTAIL(check) ))
9041c2e3 541 s = screaminstr(sv, check,
2c2d71f5 542 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 543 else
2c2d71f5 544 goto fail_finish;
4addbd3b
HS
545 /* we may be pointing at the wrong string */
546 if (s && RX_MATCH_COPIED(prog))
547 s = prog->subbeg + (s - SvPVX(sv));
cad2e5aa
JH
548 if (data)
549 *data->scream_olds = s;
550 }
f33976b4 551 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
552 s = fbm_instr((U8*)(s + start_shift),
553 (U8*)(strend - end_shift),
554 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 555 else
1aa99e6b
IH
556 s = fbm_instr(HOP3(s, start_shift, strend),
557 HOP3(strend, -end_shift, strbeg),
2c2d71f5 558 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
559
560 /* Update the count-of-usability, remove useless subpatterns,
561 unshift s. */
2c2d71f5
JH
562
563 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
564 (s ? "Found" : "Did not find"),
33b8afdf 565 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 566 PL_colors[0],
7b0972df
JH
567 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
568 SvPVX(check),
2c2d71f5
JH
569 PL_colors[1], (SvTAIL(check) ? "$" : ""),
570 (s ? " at offset " : "...\n") ) );
571
572 if (!s)
573 goto fail_finish;
574
6eb5f6b9
JH
575 check_at = s;
576
2c2d71f5 577 /* Finish the diagnostic message */
30944b6d 578 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
579
580 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
581 Start with the other substr.
582 XXXX no SCREAM optimization yet - and a very coarse implementation
583 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
584 *always* match. Probably should be marked during compile...
585 Probably it is right to do no SCREAM here...
586 */
587
33b8afdf 588 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 589 /* Take into account the "other" substring. */
2c2d71f5
JH
590 /* XXXX May be hopelessly wrong for UTF... */
591 if (!other_last)
6eb5f6b9 592 other_last = strpos;
33b8afdf 593 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
594 do_other_anchored:
595 {
1aa99e6b 596 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 597 char *s1 = s;
33b8afdf 598 SV* must;
2c2d71f5 599
2c2d71f5
JH
600 t = s - prog->check_offset_max;
601 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 602 && (!do_utf8
1aa99e6b 603 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 604 && t > strpos)))
30944b6d 605 /* EMPTY */;
2c2d71f5
JH
606 else
607 t = strpos;
1aa99e6b 608 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
609 if (t < other_last) /* These positions already checked */
610 t = other_last;
1aa99e6b 611 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
612 if (last < last1)
613 last1 = last;
614 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
615 /* On end-of-str: see comment below. */
33b8afdf
JH
616 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
617 if (must == &PL_sv_undef) {
618 s = (char*)NULL;
619 DEBUG_r(must = prog->anchored_utf8); /* for debug */
620 }
621 else
622 s = fbm_instr(
623 (unsigned char*)t,
624 HOP3(HOP3(last1, prog->anchored_offset, strend)
625 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
626 must,
627 PL_multiline ? FBMrf_MULTILINE : 0
628 );
1aa99e6b
IH
629 DEBUG_r(PerlIO_printf(Perl_debug_log,
630 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
631 (s ? "Found" : "Contradicts"),
632 PL_colors[0],
33b8afdf
JH
633 (int)(SvCUR(must)
634 - (SvTAIL(must)!=0)),
635 SvPVX(must),
636 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
637 if (!s) {
638 if (last1 >= last2) {
639 DEBUG_r(PerlIO_printf(Perl_debug_log,
640 ", giving up...\n"));
641 goto fail_finish;
642 }
643 DEBUG_r(PerlIO_printf(Perl_debug_log,
644 ", trying floating at offset %ld...\n",
1aa99e6b
IH
645 (long)(HOP3c(s1, 1, strend) - i_strpos)));
646 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
647 s = HOP3c(last, 1, strend);
2c2d71f5
JH
648 goto restart;
649 }
650 else {
651 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 652 (long)(s - i_strpos)));
1aa99e6b
IH
653 t = HOP3c(s, -prog->anchored_offset, strbeg);
654 other_last = HOP3c(s, 1, strend);
30944b6d 655 s = s1;
2c2d71f5
JH
656 if (t == strpos)
657 goto try_at_start;
2c2d71f5
JH
658 goto try_at_offset;
659 }
30944b6d 660 }
2c2d71f5
JH
661 }
662 else { /* Take into account the floating substring. */
33b8afdf
JH
663 char *last, *last1;
664 char *s1 = s;
665 SV* must;
666
667 t = HOP3c(s, -start_shift, strbeg);
668 last1 = last =
669 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
670 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
671 last = HOP3c(t, prog->float_max_offset, strend);
672 s = HOP3c(t, prog->float_min_offset, strend);
673 if (s < other_last)
674 s = other_last;
2c2d71f5 675 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
676 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
677 /* fbm_instr() takes into account exact value of end-of-str
678 if the check is SvTAIL(ed). Since false positives are OK,
679 and end-of-str is not later than strend we are OK. */
680 if (must == &PL_sv_undef) {
681 s = (char*)NULL;
682 DEBUG_r(must = prog->float_utf8); /* for debug message */
683 }
684 else
2c2d71f5 685 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
686 (unsigned char*)last + SvCUR(must)
687 - (SvTAIL(must)!=0),
688 must, PL_multiline ? FBMrf_MULTILINE : 0);
689 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
690 (s ? "Found" : "Contradicts"),
691 PL_colors[0],
692 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
693 SvPVX(must),
694 PL_colors[1], (SvTAIL(must) ? "$" : "")));
695 if (!s) {
696 if (last1 == last) {
2c2d71f5 697 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
698 ", giving up...\n"));
699 goto fail_finish;
2c2d71f5 700 }
33b8afdf
JH
701 DEBUG_r(PerlIO_printf(Perl_debug_log,
702 ", trying anchored starting at offset %ld...\n",
703 (long)(s1 + 1 - i_strpos)));
704 other_last = last;
705 s = HOP3c(t, 1, strend);
706 goto restart;
707 }
708 else {
709 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
710 (long)(s - i_strpos)));
711 other_last = s; /* Fix this later. --Hugo */
712 s = s1;
713 if (t == strpos)
714 goto try_at_start;
715 goto try_at_offset;
716 }
2c2d71f5 717 }
cad2e5aa 718 }
2c2d71f5
JH
719
720 t = s - prog->check_offset_max;
2c2d71f5 721 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 722 && (!do_utf8
1aa99e6b
IH
723 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
724 && t > strpos))) {
2c2d71f5
JH
725 /* Fixed substring is found far enough so that the match
726 cannot start at strpos. */
727 try_at_offset:
cad2e5aa 728 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
729 /* Eventually fbm_*() should handle this, but often
730 anchored_offset is not 0, so this check will not be wasted. */
731 /* XXXX In the code below we prefer to look for "^" even in
732 presence of anchored substrings. And we search even
733 beyond the found float position. These pessimizations
734 are historical artefacts only. */
735 find_anchor:
2c2d71f5 736 while (t < strend - prog->minlen) {
cad2e5aa 737 if (*t == '\n') {
4ee3650e 738 if (t < check_at - prog->check_offset_min) {
33b8afdf 739 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
740 /* Since we moved from the found position,
741 we definitely contradict the found anchored
30944b6d
IZ
742 substr. Due to the above check we do not
743 contradict "check" substr.
744 Thus we can arrive here only if check substr
745 is float. Redo checking for "other"=="fixed".
746 */
9041c2e3 747 strpos = t + 1;
30944b6d
IZ
748 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
749 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
750 goto do_other_anchored;
751 }
4ee3650e
GS
752 /* We don't contradict the found floating substring. */
753 /* XXXX Why not check for STCLASS? */
cad2e5aa 754 s = t + 1;
2c2d71f5 755 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 756 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
757 goto set_useful;
758 }
4ee3650e
GS
759 /* Position contradicts check-string */
760 /* XXXX probably better to look for check-string
761 than for "\n", so one should lower the limit for t? */
762 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 763 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 764 other_last = strpos = s = t + 1;
cad2e5aa
JH
765 goto restart;
766 }
767 t++;
768 }
2c2d71f5
JH
769 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
770 PL_colors[0],PL_colors[1]));
771 goto fail_finish;
cad2e5aa 772 }
f5952150
GS
773 else {
774 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
775 PL_colors[0],PL_colors[1]));
776 }
cad2e5aa
JH
777 s = t;
778 set_useful:
33b8afdf 779 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
780 }
781 else {
f5952150 782 /* The found string does not prohibit matching at strpos,
2c2d71f5 783 - no optimization of calling REx engine can be performed,
f5952150
GS
784 unless it was an MBOL and we are not after MBOL,
785 or a future STCLASS check will fail this. */
2c2d71f5
JH
786 try_at_start:
787 /* Even in this situation we may use MBOL flag if strpos is offset
788 wrt the start of the string. */
05b4157f 789 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 790 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
791 /* May be due to an implicit anchor of m{.*foo} */
792 && !(prog->reganch & ROPT_IMPLICIT))
793 {
cad2e5aa
JH
794 t = strpos;
795 goto find_anchor;
796 }
30944b6d 797 DEBUG_r( if (ml_anch)
f5952150
GS
798 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
799 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 800 );
2c2d71f5 801 success_at_start:
30944b6d 802 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
803 && (do_utf8 ? (
804 prog->check_utf8 /* Could be deleted already */
805 && --BmUSEFUL(prog->check_utf8) < 0
806 && (prog->check_utf8 == prog->float_utf8)
807 ) : (
808 prog->check_substr /* Could be deleted already */
809 && --BmUSEFUL(prog->check_substr) < 0
810 && (prog->check_substr == prog->float_substr)
811 )))
66e933ab 812 {
cad2e5aa 813 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 814 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
815 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
816 if (do_utf8 ? prog->check_substr : prog->check_utf8)
817 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
818 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
819 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 820 check = Nullsv; /* abort */
cad2e5aa 821 s = strpos;
3cf5c195
IZ
822 /* XXXX This is a remnant of the old implementation. It
823 looks wasteful, since now INTUIT can use many
6eb5f6b9 824 other heuristics. */
cad2e5aa
JH
825 prog->reganch &= ~RE_USE_INTUIT;
826 }
827 else
828 s = strpos;
829 }
830
6eb5f6b9
JH
831 /* Last resort... */
832 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
833 if (prog->regstclass) {
834 /* minlen == 0 is possible if regstclass is \b or \B,
835 and the fixed substr is ''$.
836 Since minlen is already taken into account, s+1 is before strend;
837 accidentally, minlen >= 1 guaranties no false positives at s + 1
838 even for \b or \B. But (minlen? 1 : 0) below assumes that
839 regstclass does not come from lookahead... */
840 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
841 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 842 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 843 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 844 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 845 : 1);
33b8afdf 846 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 847 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 848 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
849 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
850 cl_l, strend)
851 : strend);
a1933d95 852 char *startpos = strbeg;
6eb5f6b9
JH
853
854 t = s;
9926ca43 855 cache_re(prog);
f33976b4 856 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
857 if (!s) {
858#ifdef DEBUGGING
b7953727 859 char *what = 0;
6eb5f6b9
JH
860#endif
861 if (endpos == strend) {
862 DEBUG_r( PerlIO_printf(Perl_debug_log,
863 "Could not match STCLASS...\n") );
864 goto fail;
865 }
66e933ab
GS
866 DEBUG_r( PerlIO_printf(Perl_debug_log,
867 "This position contradicts STCLASS...\n") );
653099ff
GS
868 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
869 goto fail;
6eb5f6b9 870 /* Contradict one of substrings */
33b8afdf
JH
871 if (prog->anchored_substr || prog->anchored_utf8) {
872 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
873 DEBUG_r( what = "anchored" );
874 hop_and_restart:
1aa99e6b 875 s = HOP3c(t, 1, strend);
66e933ab
GS
876 if (s + start_shift + end_shift > strend) {
877 /* XXXX Should be taken into account earlier? */
878 DEBUG_r( PerlIO_printf(Perl_debug_log,
879 "Could not match STCLASS...\n") );
880 goto fail;
881 }
5e39e1e5
HS
882 if (!check)
883 goto giveup;
6eb5f6b9 884 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 885 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
886 what, (long)(s + start_shift - i_strpos)) );
887 goto restart;
888 }
66e933ab 889 /* Have both, check_string is floating */
6eb5f6b9
JH
890 if (t + start_shift >= check_at) /* Contradicts floating=check */
891 goto retry_floating_check;
892 /* Recheck anchored substring, but not floating... */
9041c2e3 893 s = check_at;
5e39e1e5
HS
894 if (!check)
895 goto giveup;
6eb5f6b9 896 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 897 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
898 (long)(other_last - i_strpos)) );
899 goto do_other_anchored;
900 }
60e71179
GS
901 /* Another way we could have checked stclass at the
902 current position only: */
903 if (ml_anch) {
904 s = t = t + 1;
5e39e1e5
HS
905 if (!check)
906 goto giveup;
60e71179 907 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
908 "Looking for /%s^%s/m starting at offset %ld...\n",
909 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 910 goto try_at_offset;
66e933ab 911 }
33b8afdf 912 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 913 goto fail;
6eb5f6b9
JH
914 /* Check is floating subtring. */
915 retry_floating_check:
916 t = check_at - start_shift;
917 DEBUG_r( what = "floating" );
918 goto hop_and_restart;
919 }
b7953727
JH
920 if (t != s) {
921 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 922 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
923 (long)(t - i_strpos), (long)(s - i_strpos))
924 );
925 }
926 else {
927 DEBUG_r(PerlIO_printf(Perl_debug_log,
928 "Does not contradict STCLASS...\n");
929 );
930 }
6eb5f6b9 931 }
5e39e1e5
HS
932 giveup:
933 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
934 PL_colors[4], (check ? "Guessed" : "Giving up"),
935 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 936 return s;
2c2d71f5
JH
937
938 fail_finish: /* Substring not found */
33b8afdf
JH
939 if (prog->check_substr || prog->check_utf8) /* could be removed already */
940 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 941 fail:
2c2d71f5 942 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
943 PL_colors[4],PL_colors[5]));
944 return Nullch;
945}
9661b544 946
6eb5f6b9 947/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
948STATIC char *
949S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 950{
6eb5f6b9
JH
951 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
952 char *m;
d8093b23
G
953 STRLEN ln;
954 unsigned int c1;
955 unsigned int c2;
6eb5f6b9
JH
956 char *e;
957 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 958 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 959
6eb5f6b9
JH
960 /* We know what class it must start with. */
961 switch (OP(c)) {
6eb5f6b9 962 case ANYOF:
388cc4de
HS
963 if (do_utf8) {
964 while (s < strend) {
965 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
966 !UTF8_IS_INVARIANT((U8)s[0]) ?
967 reginclass(c, (U8*)s, 0, do_utf8) :
968 REGINCLASS(c, (U8*)s)) {
969 if (tmp && (norun || regtry(prog, s)))
970 goto got_it;
971 else
972 tmp = doevery;
973 }
974 else
975 tmp = 1;
976 s += UTF8SKIP(s);
977 }
978 }
979 else {
980 while (s < strend) {
981 STRLEN skip = 1;
982
983 if (REGINCLASS(c, (U8*)s) ||
984 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
985 /* The assignment of 2 is intentional:
986 * for the folded sharp s, the skip is 2. */
987 (skip = SHARP_S_SKIP))) {
988 if (tmp && (norun || regtry(prog, s)))
989 goto got_it;
990 else
991 tmp = doevery;
992 }
993 else
994 tmp = 1;
995 s += skip;
996 }
a0d0e21e 997 }
6eb5f6b9 998 break;
f33976b4
DB
999 case CANY:
1000 while (s < strend) {
1001 if (tmp && (norun || regtry(prog, s)))
1002 goto got_it;
1003 else
1004 tmp = doevery;
1005 s++;
1006 }
1007 break;
6eb5f6b9
JH
1008 case EXACTF:
1009 m = STRING(c);
1010 ln = STR_LEN(c);
1aa99e6b 1011 if (UTF) {
a2a2844f 1012 STRLEN ulen1, ulen2;
e7ae6809
JH
1013 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1014 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
1015
1016 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1017 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1018
872c91ae
JH
1019 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1020 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1021 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1022 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1023 }
1024 else {
1025 c1 = *(U8*)m;
1026 c2 = PL_fold[c1];
1027 }
6eb5f6b9
JH
1028 goto do_exactf;
1029 case EXACTFL:
1030 m = STRING(c);
1031 ln = STR_LEN(c);
d8093b23 1032 c1 = *(U8*)m;
6eb5f6b9
JH
1033 c2 = PL_fold_locale[c1];
1034 do_exactf:
6795a346 1035 e = HOP3c(strend, -(I32)ln, s);
b3c9acc1 1036
6eb5f6b9
JH
1037 if (norun && e < s)
1038 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1039
60a8b682
JH
1040 /* The idea in the EXACTF* cases is to first find the
1041 * first character of the EXACTF* node and then, if
1042 * necessary, case-insensitively compare the full
1043 * text of the node. The c1 and c2 are the first
1044 * characters (though in Unicode it gets a bit
1045 * more complicated because there are more cases
7f16dd3d
JH
1046 * than just upper and lower: one needs to use
1047 * the so-called folding case for case-insensitive
1048 * matching (called "loose matching" in Unicode).
1049 * ibcmp_utf8() will do just that. */
60a8b682 1050
1aa99e6b 1051 if (do_utf8) {
575cac57
JH
1052 UV c, f;
1053 U8 tmpbuf [UTF8_MAXLEN+1];
1054 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1055 STRLEN len, foldlen;
d7f013c8 1056
09091399 1057 if (c1 == c2) {
1aa99e6b 1058 while (s <= e) {
872c91ae
JH
1059 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1060 ckWARN(WARN_UTF8) ?
1061 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1062 if ( c == c1
1063 && (ln == len ||
66423254 1064 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1065 m, (char **)0, ln, (bool)UTF))
55da9344 1066 && (norun || regtry(prog, s)) )
1aa99e6b 1067 goto got_it;
80aecb99
JH
1068 else {
1069 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1070 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1071 if ( f != c
1072 && (f == c1 || f == c2)
1073 && (ln == foldlen ||
66423254
JH
1074 !ibcmp_utf8((char *) foldbuf,
1075 (char **)0, foldlen, do_utf8,
d07ddd77 1076 m,
eb160463 1077 (char **)0, ln, (bool)UTF))
80aecb99
JH
1078 && (norun || regtry(prog, s)) )
1079 goto got_it;
1080 }
1aa99e6b
IH
1081 s += len;
1082 }
09091399
JH
1083 }
1084 else {
1aa99e6b 1085 while (s <= e) {
872c91ae
JH
1086 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1087 ckWARN(WARN_UTF8) ?
1088 0 : UTF8_ALLOW_ANY);
80aecb99 1089
60a8b682 1090 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1091 * Note that not all the possible combinations
1092 * are handled here: some of them are handled
1093 * by the standard folding rules, and some of
1094 * them (the character class or ANYOF cases)
1095 * are handled during compiletime in
1096 * regexec.c:S_regclass(). */
880bd946
JH
1097 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1098 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1099 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1100
1101 if ( (c == c1 || c == c2)
1102 && (ln == len ||
66423254 1103 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1104 m, (char **)0, ln, (bool)UTF))
55da9344 1105 && (norun || regtry(prog, s)) )
1aa99e6b 1106 goto got_it;
80aecb99
JH
1107 else {
1108 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1109 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1110 if ( f != c
1111 && (f == c1 || f == c2)
1112 && (ln == foldlen ||
a6872d42 1113 !ibcmp_utf8((char *) foldbuf,
66423254 1114 (char **)0, foldlen, do_utf8,
d07ddd77 1115 m,
eb160463 1116 (char **)0, ln, (bool)UTF))
80aecb99
JH
1117 && (norun || regtry(prog, s)) )
1118 goto got_it;
1119 }
1aa99e6b
IH
1120 s += len;
1121 }
09091399 1122 }
1aa99e6b
IH
1123 }
1124 else {
1125 if (c1 == c2)
1126 while (s <= e) {
1127 if ( *(U8*)s == c1
1128 && (ln == 1 || !(OP(c) == EXACTF
1129 ? ibcmp(s, m, ln)
1130 : ibcmp_locale(s, m, ln)))
1131 && (norun || regtry(prog, s)) )
1132 goto got_it;
1133 s++;
1134 }
1135 else
1136 while (s <= e) {
1137 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1138 && (ln == 1 || !(OP(c) == EXACTF
1139 ? ibcmp(s, m, ln)
1140 : ibcmp_locale(s, m, ln)))
1141 && (norun || regtry(prog, s)) )
1142 goto got_it;
1143 s++;
1144 }
b3c9acc1
IZ
1145 }
1146 break;
bbce6d69 1147 case BOUNDL:
3280af22 1148 PL_reg_flags |= RF_tainted;
bbce6d69 1149 /* FALL THROUGH */
a0d0e21e 1150 case BOUND:
ffc61ed2 1151 if (do_utf8) {
12d33761 1152 if (s == PL_bostr)
ffc61ed2
JH
1153 tmp = '\n';
1154 else {
b4f7163a 1155 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1156
b4f7163a 1157 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1158 }
1159 tmp = ((OP(c) == BOUND ?
9041c2e3 1160 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1161 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1162 while (s < strend) {
1163 if (tmp == !(OP(c) == BOUND ?
3568d838 1164 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1165 isALNUM_LC_utf8((U8*)s)))
1166 {
1167 tmp = !tmp;
1168 if ((norun || regtry(prog, s)))
1169 goto got_it;
1170 }
1171 s += UTF8SKIP(s);
a687059c 1172 }
a0d0e21e 1173 }
667bb95a 1174 else {
12d33761 1175 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1176 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1177 while (s < strend) {
1178 if (tmp ==
1179 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1180 tmp = !tmp;
1181 if ((norun || regtry(prog, s)))
1182 goto got_it;
1183 }
1184 s++;
a0ed51b3 1185 }
a0ed51b3 1186 }
6eb5f6b9 1187 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1188 goto got_it;
1189 break;
bbce6d69 1190 case NBOUNDL:
3280af22 1191 PL_reg_flags |= RF_tainted;
bbce6d69 1192 /* FALL THROUGH */
a0d0e21e 1193 case NBOUND:
ffc61ed2 1194 if (do_utf8) {
12d33761 1195 if (s == PL_bostr)
ffc61ed2
JH
1196 tmp = '\n';
1197 else {
b4f7163a 1198 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1199
b4f7163a 1200 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1201 }
1202 tmp = ((OP(c) == NBOUND ?
9041c2e3 1203 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1204 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1205 while (s < strend) {
1206 if (tmp == !(OP(c) == NBOUND ?
3568d838 1207 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1208 isALNUM_LC_utf8((U8*)s)))
1209 tmp = !tmp;
1210 else if ((norun || regtry(prog, s)))
1211 goto got_it;
1212 s += UTF8SKIP(s);
1213 }
a0d0e21e 1214 }
667bb95a 1215 else {
12d33761 1216 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1217 tmp = ((OP(c) == NBOUND ?
1218 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1219 while (s < strend) {
1220 if (tmp ==
1221 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1222 tmp = !tmp;
1223 else if ((norun || regtry(prog, s)))
1224 goto got_it;
1225 s++;
1226 }
a0ed51b3 1227 }
6eb5f6b9 1228 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1229 goto got_it;
1230 break;
a0d0e21e 1231 case ALNUM:
ffc61ed2 1232 if (do_utf8) {
8269fa76 1233 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1234 while (s < strend) {
3568d838 1235 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1236 if (tmp && (norun || regtry(prog, s)))
1237 goto got_it;
1238 else
1239 tmp = doevery;
1240 }
bbce6d69 1241 else
ffc61ed2
JH
1242 tmp = 1;
1243 s += UTF8SKIP(s);
bbce6d69 1244 }
bbce6d69 1245 }
ffc61ed2
JH
1246 else {
1247 while (s < strend) {
1248 if (isALNUM(*s)) {
1249 if (tmp && (norun || regtry(prog, s)))
1250 goto got_it;
1251 else
1252 tmp = doevery;
1253 }
a0ed51b3 1254 else
ffc61ed2
JH
1255 tmp = 1;
1256 s++;
a0ed51b3 1257 }
a0ed51b3
LW
1258 }
1259 break;
bbce6d69 1260 case ALNUML:
3280af22 1261 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1262 if (do_utf8) {
1263 while (s < strend) {
1264 if (isALNUM_LC_utf8((U8*)s)) {
1265 if (tmp && (norun || regtry(prog, s)))
1266 goto got_it;
1267 else
1268 tmp = doevery;
1269 }
a687059c 1270 else
ffc61ed2
JH
1271 tmp = 1;
1272 s += UTF8SKIP(s);
a0d0e21e 1273 }
a0d0e21e 1274 }
ffc61ed2
JH
1275 else {
1276 while (s < strend) {
1277 if (isALNUM_LC(*s)) {
1278 if (tmp && (norun || regtry(prog, s)))
1279 goto got_it;
1280 else
1281 tmp = doevery;
1282 }
a0ed51b3 1283 else
ffc61ed2
JH
1284 tmp = 1;
1285 s++;
a0ed51b3 1286 }
a0ed51b3
LW
1287 }
1288 break;
a0d0e21e 1289 case NALNUM:
ffc61ed2 1290 if (do_utf8) {
8269fa76 1291 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1292 while (s < strend) {
3568d838 1293 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1294 if (tmp && (norun || regtry(prog, s)))
1295 goto got_it;
1296 else
1297 tmp = doevery;
1298 }
bbce6d69 1299 else
ffc61ed2
JH
1300 tmp = 1;
1301 s += UTF8SKIP(s);
bbce6d69 1302 }
bbce6d69 1303 }
ffc61ed2
JH
1304 else {
1305 while (s < strend) {
1306 if (!isALNUM(*s)) {
1307 if (tmp && (norun || regtry(prog, s)))
1308 goto got_it;
1309 else
1310 tmp = doevery;
1311 }
a0ed51b3 1312 else
ffc61ed2
JH
1313 tmp = 1;
1314 s++;
a0ed51b3 1315 }
a0ed51b3
LW
1316 }
1317 break;
bbce6d69 1318 case NALNUML:
3280af22 1319 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1320 if (do_utf8) {
1321 while (s < strend) {
1322 if (!isALNUM_LC_utf8((U8*)s)) {
1323 if (tmp && (norun || regtry(prog, s)))
1324 goto got_it;
1325 else
1326 tmp = doevery;
1327 }
a687059c 1328 else
ffc61ed2
JH
1329 tmp = 1;
1330 s += UTF8SKIP(s);
a687059c 1331 }
a0d0e21e 1332 }
ffc61ed2
JH
1333 else {
1334 while (s < strend) {
1335 if (!isALNUM_LC(*s)) {
1336 if (tmp && (norun || regtry(prog, s)))
1337 goto got_it;
1338 else
1339 tmp = doevery;
1340 }
a0ed51b3 1341 else
ffc61ed2
JH
1342 tmp = 1;
1343 s++;
a0ed51b3 1344 }
a0ed51b3
LW
1345 }
1346 break;
a0d0e21e 1347 case SPACE:
ffc61ed2 1348 if (do_utf8) {
8269fa76 1349 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1350 while (s < strend) {
3568d838 1351 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1352 if (tmp && (norun || regtry(prog, s)))
1353 goto got_it;
1354 else
1355 tmp = doevery;
1356 }
a0d0e21e 1357 else
ffc61ed2
JH
1358 tmp = 1;
1359 s += UTF8SKIP(s);
2304df62 1360 }
a0d0e21e 1361 }
ffc61ed2
JH
1362 else {
1363 while (s < strend) {
1364 if (isSPACE(*s)) {
1365 if (tmp && (norun || regtry(prog, s)))
1366 goto got_it;
1367 else
1368 tmp = doevery;
1369 }
a0ed51b3 1370 else
ffc61ed2
JH
1371 tmp = 1;
1372 s++;
a0ed51b3 1373 }
a0ed51b3
LW
1374 }
1375 break;
bbce6d69 1376 case SPACEL:
3280af22 1377 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1378 if (do_utf8) {
1379 while (s < strend) {
1380 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1381 if (tmp && (norun || regtry(prog, s)))
1382 goto got_it;
1383 else
1384 tmp = doevery;
1385 }
bbce6d69 1386 else
ffc61ed2
JH
1387 tmp = 1;
1388 s += UTF8SKIP(s);
bbce6d69 1389 }
bbce6d69 1390 }
ffc61ed2
JH
1391 else {
1392 while (s < strend) {
1393 if (isSPACE_LC(*s)) {
1394 if (tmp && (norun || regtry(prog, s)))
1395 goto got_it;
1396 else
1397 tmp = doevery;
1398 }
a0ed51b3 1399 else
ffc61ed2
JH
1400 tmp = 1;
1401 s++;
a0ed51b3 1402 }
a0ed51b3
LW
1403 }
1404 break;
a0d0e21e 1405 case NSPACE:
ffc61ed2 1406 if (do_utf8) {
8269fa76 1407 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1408 while (s < strend) {
3568d838 1409 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1410 if (tmp && (norun || regtry(prog, s)))
1411 goto got_it;
1412 else
1413 tmp = doevery;
1414 }
a0d0e21e 1415 else
ffc61ed2
JH
1416 tmp = 1;
1417 s += UTF8SKIP(s);
a687059c 1418 }
a0d0e21e 1419 }
ffc61ed2
JH
1420 else {
1421 while (s < strend) {
1422 if (!isSPACE(*s)) {
1423 if (tmp && (norun || regtry(prog, s)))
1424 goto got_it;
1425 else
1426 tmp = doevery;
1427 }
a0ed51b3 1428 else
ffc61ed2
JH
1429 tmp = 1;
1430 s++;
a0ed51b3 1431 }
a0ed51b3
LW
1432 }
1433 break;
bbce6d69 1434 case NSPACEL:
3280af22 1435 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1436 if (do_utf8) {
1437 while (s < strend) {
1438 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1439 if (tmp && (norun || regtry(prog, s)))
1440 goto got_it;
1441 else
1442 tmp = doevery;
1443 }
bbce6d69 1444 else
ffc61ed2
JH
1445 tmp = 1;
1446 s += UTF8SKIP(s);
bbce6d69 1447 }
bbce6d69 1448 }
ffc61ed2
JH
1449 else {
1450 while (s < strend) {
1451 if (!isSPACE_LC(*s)) {
1452 if (tmp && (norun || regtry(prog, s)))
1453 goto got_it;
1454 else
1455 tmp = doevery;
1456 }
a0ed51b3 1457 else
ffc61ed2
JH
1458 tmp = 1;
1459 s++;
a0ed51b3 1460 }
a0ed51b3
LW
1461 }
1462 break;
a0d0e21e 1463 case DIGIT:
ffc61ed2 1464 if (do_utf8) {
8269fa76 1465 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1466 while (s < strend) {
3568d838 1467 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1468 if (tmp && (norun || regtry(prog, s)))
1469 goto got_it;
1470 else
1471 tmp = doevery;
1472 }
a0d0e21e 1473 else
ffc61ed2
JH
1474 tmp = 1;
1475 s += UTF8SKIP(s);
2b69d0c2 1476 }
a0d0e21e 1477 }
ffc61ed2
JH
1478 else {
1479 while (s < strend) {
1480 if (isDIGIT(*s)) {
1481 if (tmp && (norun || regtry(prog, s)))
1482 goto got_it;
1483 else
1484 tmp = doevery;
1485 }
a0ed51b3 1486 else
ffc61ed2
JH
1487 tmp = 1;
1488 s++;
a0ed51b3 1489 }
a0ed51b3
LW
1490 }
1491 break;
b8c5462f
JH
1492 case DIGITL:
1493 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1494 if (do_utf8) {
1495 while (s < strend) {
1496 if (isDIGIT_LC_utf8((U8*)s)) {
1497 if (tmp && (norun || regtry(prog, s)))
1498 goto got_it;
1499 else
1500 tmp = doevery;
1501 }
b8c5462f 1502 else
ffc61ed2
JH
1503 tmp = 1;
1504 s += UTF8SKIP(s);
b8c5462f 1505 }
b8c5462f 1506 }
ffc61ed2
JH
1507 else {
1508 while (s < strend) {
1509 if (isDIGIT_LC(*s)) {
1510 if (tmp && (norun || regtry(prog, s)))
1511 goto got_it;
1512 else
1513 tmp = doevery;
1514 }
b8c5462f 1515 else
ffc61ed2
JH
1516 tmp = 1;
1517 s++;
b8c5462f 1518 }
b8c5462f
JH
1519 }
1520 break;
a0d0e21e 1521 case NDIGIT:
ffc61ed2 1522 if (do_utf8) {
8269fa76 1523 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1524 while (s < strend) {
3568d838 1525 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1526 if (tmp && (norun || regtry(prog, s)))
1527 goto got_it;
1528 else
1529 tmp = doevery;
1530 }
a0d0e21e 1531 else
ffc61ed2
JH
1532 tmp = 1;
1533 s += UTF8SKIP(s);
a687059c 1534 }
a0d0e21e 1535 }
ffc61ed2
JH
1536 else {
1537 while (s < strend) {
1538 if (!isDIGIT(*s)) {
1539 if (tmp && (norun || regtry(prog, s)))
1540 goto got_it;
1541 else
1542 tmp = doevery;
1543 }
a0ed51b3 1544 else
ffc61ed2
JH
1545 tmp = 1;
1546 s++;
a0ed51b3 1547 }
a0ed51b3
LW
1548 }
1549 break;
b8c5462f
JH
1550 case NDIGITL:
1551 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1552 if (do_utf8) {
1553 while (s < strend) {
1554 if (!isDIGIT_LC_utf8((U8*)s)) {
1555 if (tmp && (norun || regtry(prog, s)))
1556 goto got_it;
1557 else
1558 tmp = doevery;
1559 }
b8c5462f 1560 else
ffc61ed2
JH
1561 tmp = 1;
1562 s += UTF8SKIP(s);
b8c5462f 1563 }
a0ed51b3 1564 }
ffc61ed2
JH
1565 else {
1566 while (s < strend) {
1567 if (!isDIGIT_LC(*s)) {
1568 if (tmp && (norun || regtry(prog, s)))
1569 goto got_it;
1570 else
1571 tmp = doevery;
1572 }
cf93c79d 1573 else
ffc61ed2
JH
1574 tmp = 1;
1575 s++;
b8c5462f 1576 }
b8c5462f
JH
1577 }
1578 break;
b3c9acc1 1579 default:
3c3eec57
GS
1580 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1581 break;
d6a28714 1582 }
6eb5f6b9
JH
1583 return 0;
1584 got_it:
1585 return s;
1586}
1587
1588/*
1589 - regexec_flags - match a regexp against a string
1590 */
1591I32
1592Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1593 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1594/* strend: pointer to null at end of string */
1595/* strbeg: real beginning of string */
1596/* minend: end of match must be >=minend after stringarg. */
1597/* data: May be used for some additional optimizations. */
1598/* nosave: For optimizations. */
1599{
6eb5f6b9
JH
1600 register char *s;
1601 register regnode *c;
1602 register char *startpos = stringarg;
6eb5f6b9
JH
1603 I32 minlen; /* must match at least this many chars */
1604 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1605 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1606 constant substr. */ /* CC */
1607 I32 end_shift = 0; /* Same for the end. */ /* CC */
1608 I32 scream_pos = -1; /* Internal iterator of scream. */
1609 char *scream_olds;
1610 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1611 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1612#ifdef DEBUGGING
9e55ce06
JH
1613 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1614 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1615#endif
a30b2f1f 1616 RX_MATCH_UTF8_set(prog,do_utf8);
6eb5f6b9
JH
1617
1618 PL_regcc = 0;
1619
1620 cache_re(prog);
1621#ifdef DEBUGGING
aea4f609 1622 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1623#endif
1624
1625 /* Be paranoid... */
1626 if (prog == NULL || startpos == NULL) {
1627 Perl_croak(aTHX_ "NULL regexp parameter");
1628 return 0;
1629 }
1630
1631 minlen = prog->minlen;
61a36c01 1632 if (strend - startpos < minlen) {
a72c7584
JH
1633 DEBUG_r(PerlIO_printf(Perl_debug_log,
1634 "String too short [regexec_flags]...\n"));
1635 goto phooey;
1aa99e6b 1636 }
6eb5f6b9 1637
6eb5f6b9
JH
1638 /* Check validity of program. */
1639 if (UCHARAT(prog->program) != REG_MAGIC) {
1640 Perl_croak(aTHX_ "corrupted regexp program");
1641 }
1642
1643 PL_reg_flags = 0;
1644 PL_reg_eval_set = 0;
1645 PL_reg_maxiter = 0;
1646
1647 if (prog->reganch & ROPT_UTF8)
1648 PL_reg_flags |= RF_utf8;
1649
1650 /* Mark beginning of line for ^ and lookbehind. */
1651 PL_regbol = startpos;
1652 PL_bostr = strbeg;
1653 PL_reg_sv = sv;
1654
1655 /* Mark end of line for $ (and such) */
1656 PL_regeol = strend;
1657
1658 /* see how far we have to get to not match where we matched before */
1659 PL_regtill = startpos+minend;
1660
1661 /* We start without call_cc context. */
1662 PL_reg_call_cc = 0;
1663
1664 /* If there is a "must appear" string, look for it. */
1665 s = startpos;
1666
1667 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1668 MAGIC *mg;
1669
1670 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1671 PL_reg_ganch = startpos;
1672 else if (sv && SvTYPE(sv) >= SVt_PVMG
1673 && SvMAGIC(sv)
14befaf4
DM
1674 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1675 && mg->mg_len >= 0) {
6eb5f6b9
JH
1676 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1677 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1678 if (s > PL_reg_ganch)
6eb5f6b9
JH
1679 goto phooey;
1680 s = PL_reg_ganch;
1681 }
1682 }
1683 else /* pos() not defined */
1684 PL_reg_ganch = strbeg;
1685 }
1686
33b8afdf 1687 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1688 re_scream_pos_data d;
1689
1690 d.scream_olds = &scream_olds;
1691 d.scream_pos = &scream_pos;
1692 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1693 if (!s) {
1694 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1695 goto phooey; /* not present */
3fa9c3d7 1696 }
6eb5f6b9
JH
1697 }
1698
2a782b5b 1699 DEBUG_r({
9e55ce06
JH
1700 char *s0 = UTF ?
1701 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1702 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1703 prog->precomp;
1704 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1705 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1706 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1707 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1708 if (!PL_colorset)
1709 reginitcolors();
1710 PerlIO_printf(Perl_debug_log,
9e55ce06 1711 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1712 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1713 len0, len0, s0,
2a782b5b 1714 PL_colors[1],
9e55ce06 1715 len0 > 60 ? "..." : "",
2a782b5b 1716 PL_colors[0],
9e55ce06
JH
1717 (int)(len1 > 60 ? 60 : len1),
1718 s1, PL_colors[1],
1719 (len1 > 60 ? "..." : "")
2a782b5b
JH
1720 );
1721 });
6eb5f6b9
JH
1722
1723 /* Simplest case: anchored match need be tried only once. */
1724 /* [unless only anchor is BOL and multiline is set] */
1725 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1726 if (s == startpos && regtry(prog, startpos))
1727 goto got_it;
1728 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1729 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1730 {
1731 char *end;
1732
1733 if (minlen)
1734 dontbother = minlen - 1;
1aa99e6b 1735 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1736 /* for multiline we only have to try after newlines */
33b8afdf 1737 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1738 if (s == startpos)
1739 goto after_try;
1740 while (1) {
1741 if (regtry(prog, s))
1742 goto got_it;
1743 after_try:
1744 if (s >= end)
1745 goto phooey;
1746 if (prog->reganch & RE_USE_INTUIT) {
1747 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1748 if (!s)
1749 goto phooey;
1750 }
1751 else
1752 s++;
1753 }
1754 } else {
1755 if (s > startpos)
1756 s--;
1757 while (s < end) {
1758 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1759 if (regtry(prog, s))
1760 goto got_it;
1761 }
1762 }
1763 }
1764 }
1765 goto phooey;
1766 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1767 if (regtry(prog, PL_reg_ganch))
1768 goto got_it;
1769 goto phooey;
1770 }
1771
1772 /* Messy cases: unanchored match. */
33b8afdf 1773 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1774 /* we have /x+whatever/ */
1775 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1776 char ch;
bf93d4cc
GS
1777#ifdef DEBUGGING
1778 int did_match = 0;
1779#endif
33b8afdf
JH
1780 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1781 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1782 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1783
1aa99e6b 1784 if (do_utf8) {
6eb5f6b9
JH
1785 while (s < strend) {
1786 if (*s == ch) {
bf93d4cc 1787 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1788 if (regtry(prog, s)) goto got_it;
1789 s += UTF8SKIP(s);
1790 while (s < strend && *s == ch)
1791 s += UTF8SKIP(s);
1792 }
1793 s += UTF8SKIP(s);
1794 }
1795 }
1796 else {
1797 while (s < strend) {
1798 if (*s == ch) {
bf93d4cc 1799 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1800 if (regtry(prog, s)) goto got_it;
1801 s++;
1802 while (s < strend && *s == ch)
1803 s++;
1804 }
1805 s++;
1806 }
1807 }
b7953727 1808 DEBUG_r(if (!did_match)
bf93d4cc 1809 PerlIO_printf(Perl_debug_log,
b7953727
JH
1810 "Did not find anchored character...\n")
1811 );
6eb5f6b9
JH
1812 }
1813 /*SUPPRESS 560*/
33b8afdf
JH
1814 else if (prog->anchored_substr != Nullsv
1815 || prog->anchored_utf8 != Nullsv
1816 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1817 && prog->float_max_offset < strend - s)) {
1818 SV *must;
1819 I32 back_max;
1820 I32 back_min;
1821 char *last;
6eb5f6b9 1822 char *last1; /* Last position checked before */
bf93d4cc
GS
1823#ifdef DEBUGGING
1824 int did_match = 0;
1825#endif
33b8afdf
JH
1826 if (prog->anchored_substr || prog->anchored_utf8) {
1827 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1828 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1829 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1830 back_max = back_min = prog->anchored_offset;
1831 } else {
1832 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1833 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1834 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1835 back_max = prog->float_max_offset;
1836 back_min = prog->float_min_offset;
1837 }
1838 if (must == &PL_sv_undef)
1839 /* could not downgrade utf8 check substring, so must fail */
1840 goto phooey;
1841
1842 last = HOP3c(strend, /* Cannot start after this */
1843 -(I32)(CHR_SVLEN(must)
1844 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1845
1846 if (s > PL_bostr)
1847 last1 = HOPc(s, -1);
1848 else
1849 last1 = s - 1; /* bogus */
1850
1851 /* XXXX check_substr already used to find `s', can optimize if
1852 check_substr==must. */
1853 scream_pos = -1;
1854 dontbother = end_shift;
1855 strend = HOPc(strend, -dontbother);
1856 while ( (s <= last) &&
9041c2e3 1857 ((flags & REXEC_SCREAM)
1aa99e6b 1858 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1859 end_shift, &scream_pos, 0))
1aa99e6b 1860 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1861 (unsigned char*)strend, must,
6eb5f6b9 1862 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1863 /* we may be pointing at the wrong string */
1864 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1865 s = prog->subbeg + (s - SvPVX(sv));
bf93d4cc 1866 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1867 if (HOPc(s, -back_max) > last1) {
1868 last1 = HOPc(s, -back_min);
1869 s = HOPc(s, -back_max);
1870 }
1871 else {
1872 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1873
1874 last1 = HOPc(s, -back_min);
1875 s = t;
1876 }
1aa99e6b 1877 if (do_utf8) {
6eb5f6b9
JH
1878 while (s <= last1) {
1879 if (regtry(prog, s))
1880 goto got_it;
1881 s += UTF8SKIP(s);
1882 }
1883 }
1884 else {
1885 while (s <= last1) {
1886 if (regtry(prog, s))
1887 goto got_it;
1888 s++;
1889 }
1890 }
1891 }
b7953727
JH
1892 DEBUG_r(if (!did_match)
1893 PerlIO_printf(Perl_debug_log,
1894 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1895 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1896 ? "anchored" : "floating"),
1897 PL_colors[0],
1898 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1899 SvPVX(must),
b7953727
JH
1900 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1901 );
6eb5f6b9
JH
1902 goto phooey;
1903 }
155aba94 1904 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1905 if (minlen) {
1906 I32 op = (U8)OP(prog->regstclass);
66e933ab 1907 /* don't bother with what can't match */
f14c76ed
RGS
1908 if (PL_regkind[op] != EXACT && op != CANY)
1909 strend = HOPc(strend, -(minlen - 1));
1910 }
ffc61ed2
JH
1911 DEBUG_r({
1912 SV *prop = sv_newmortal();
9e55ce06
JH
1913 char *s0;
1914 char *s1;
1915 int len0;
1916 int len1;
1917
ffc61ed2 1918 regprop(prop, c);
9e55ce06
JH
1919 s0 = UTF ?
1920 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1921 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1922 SvPVX(prop);
1923 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1924 s1 = UTF ?
c728cb41 1925 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1926 len1 = UTF ? SvCUR(dsv1) : strend - s;
1927 PerlIO_printf(Perl_debug_log,
1928 "Matching stclass `%*.*s' against `%*.*s'\n",
1929 len0, len0, s0,
1930 len1, len1, s1);
ffc61ed2 1931 });
6eb5f6b9
JH
1932 if (find_byclass(prog, c, s, strend, startpos, 0))
1933 goto got_it;
bf93d4cc 1934 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1935 }
1936 else {
1937 dontbother = 0;
33b8afdf
JH
1938 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1939 /* Trim the end. */
d6a28714 1940 char *last;
33b8afdf
JH
1941 SV* float_real;
1942
1943 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1944 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1945 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1946
1947 if (flags & REXEC_SCREAM) {
33b8afdf 1948 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1949 end_shift, &scream_pos, 1); /* last one */
1950 if (!last)
ffc61ed2 1951 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1952 /* we may be pointing at the wrong string */
1953 else if (RX_MATCH_COPIED(prog))
1954 s = prog->subbeg + (s - SvPVX(sv));
b8c5462f 1955 }
d6a28714
JH
1956 else {
1957 STRLEN len;
33b8afdf 1958 char *little = SvPV(float_real, len);
d6a28714 1959
33b8afdf 1960 if (SvTAIL(float_real)) {
d6a28714
JH
1961 if (memEQ(strend - len + 1, little, len - 1))
1962 last = strend - len + 1;
1963 else if (!PL_multiline)
9041c2e3 1964 last = memEQ(strend - len, little, len)
d6a28714 1965 ? strend - len : Nullch;
b8c5462f 1966 else
d6a28714
JH
1967 goto find_last;
1968 } else {
1969 find_last:
9041c2e3 1970 if (len)
d6a28714 1971 last = rninstr(s, strend, little, little + len);
b8c5462f 1972 else
d6a28714 1973 last = strend; /* matching `$' */
b8c5462f 1974 }
b8c5462f 1975 }
bf93d4cc
GS
1976 if (last == NULL) {
1977 DEBUG_r(PerlIO_printf(Perl_debug_log,
1978 "%sCan't trim the tail, match fails (should not happen)%s\n",
1979 PL_colors[4],PL_colors[5]));
1980 goto phooey; /* Should not happen! */
1981 }
d6a28714
JH
1982 dontbother = strend - last + prog->float_min_offset;
1983 }
1984 if (minlen && (dontbother < minlen))
1985 dontbother = minlen - 1;
1986 strend -= dontbother; /* this one's always in bytes! */
1987 /* We don't know much -- general case. */
1aa99e6b 1988 if (do_utf8) {
d6a28714
JH
1989 for (;;) {
1990 if (regtry(prog, s))
1991 goto got_it;
1992 if (s >= strend)
1993 break;
b8c5462f 1994 s += UTF8SKIP(s);
d6a28714
JH
1995 };
1996 }
1997 else {
1998 do {
1999 if (regtry(prog, s))
2000 goto got_it;
2001 } while (s++ < strend);
2002 }
2003 }
2004
2005 /* Failure. */
2006 goto phooey;
2007
2008got_it:
2009 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2010
2011 if (PL_reg_eval_set) {
2012 /* Preserve the current value of $^R */
2013 if (oreplsv != GvSV(PL_replgv))
2014 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2015 restored, the value remains
2016 the same. */
acfe0abc 2017 restore_pos(aTHX_ 0);
d6a28714
JH
2018 }
2019
2020 /* make sure $`, $&, $', and $digit will work later */
2021 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 2022 RX_MATCH_COPY_FREE(prog);
d6a28714
JH
2023 if (flags & REXEC_COPY_STR) {
2024 I32 i = PL_regeol - startpos + (stringarg - strbeg);
ed252734
NC
2025#ifdef PERL_COPY_ON_WRITE
2026 if ((SvIsCOW(sv)
2027 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2028 if (DEBUG_C_TEST) {
2029 PerlIO_printf(Perl_debug_log,
2030 "Copy on write: regexp capture, type %d\n",
2031 (int) SvTYPE(sv));
2032 }
2033 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2034 prog->subbeg = SvPVX(prog->saved_copy);
2035 assert (SvPOKp(prog->saved_copy));
2036 } else
2037#endif
2038 {
2039 RX_MATCH_COPIED_on(prog);
2040 s = savepvn(strbeg, i);
2041 prog->subbeg = s;
2042 }
d6a28714 2043 prog->sublen = i;
d6a28714
JH
2044 }
2045 else {
2046 prog->subbeg = strbeg;
2047 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2048 }
2049 }
9041c2e3 2050
d6a28714
JH
2051 return 1;
2052
2053phooey:
bf93d4cc
GS
2054 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2055 PL_colors[4],PL_colors[5]));
d6a28714 2056 if (PL_reg_eval_set)
acfe0abc 2057 restore_pos(aTHX_ 0);
d6a28714
JH
2058 return 0;
2059}
2060
2061/*
2062 - regtry - try match at specific point
2063 */
2064STATIC I32 /* 0 failure, 1 success */
2065S_regtry(pTHX_ regexp *prog, char *startpos)
2066{
d6a28714
JH
2067 register I32 i;
2068 register I32 *sp;
2069 register I32 *ep;
2070 CHECKPOINT lastcp;
2071
02db2b7b
IZ
2072#ifdef DEBUGGING
2073 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2074#endif
d6a28714
JH
2075 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2076 MAGIC *mg;
2077
2078 PL_reg_eval_set = RS_init;
2079 DEBUG_r(DEBUG_s(
b900a521
JH
2080 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2081 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2082 ));
e8347627 2083 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2084 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2085 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2086 SAVETMPS;
2087 /* Apparently this is not needed, judging by wantarray. */
e8347627 2088 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2089 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2090
2091 if (PL_reg_sv) {
2092 /* Make $_ available to executed code. */
2093 if (PL_reg_sv != DEFSV) {
4d1ff10f 2094 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2095 SAVESPTR(DEFSV);
2096 DEFSV = PL_reg_sv;
b8c5462f 2097 }
d6a28714 2098
9041c2e3 2099 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2100 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2101 /* prepare for quick setting of pos */
14befaf4
DM
2102 sv_magic(PL_reg_sv, (SV*)0,
2103 PERL_MAGIC_regex_global, Nullch, 0);
2104 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2105 mg->mg_len = -1;
b8c5462f 2106 }
d6a28714
JH
2107 PL_reg_magic = mg;
2108 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2109 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2110 }
09687e5a 2111 if (!PL_reg_curpm) {
0f79a09d 2112 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2113#ifdef USE_ITHREADS
2114 {
2115 SV* repointer = newSViv(0);
577e12cc 2116 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2117 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2118 av_push(PL_regex_padav,repointer);
2119 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2120 PL_regex_pad = AvARRAY(PL_regex_padav);
2121 }
2122#endif
2123 }
aaa362c4 2124 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2125 PL_reg_oldcurpm = PL_curpm;
2126 PL_curpm = PL_reg_curpm;
2127 if (RX_MATCH_COPIED(prog)) {
2128 /* Here is a serious problem: we cannot rewrite subbeg,
2129 since it may be needed if this match fails. Thus
2130 $` inside (?{}) could fail... */
2131 PL_reg_oldsaved = prog->subbeg;
2132 PL_reg_oldsavedlen = prog->sublen;
ed252734
NC
2133#ifdef PERL_COPY_ON_WRITE
2134 PL_nrs = prog->saved_copy;
2135#endif
d6a28714
JH
2136 RX_MATCH_COPIED_off(prog);
2137 }
2138 else
2139 PL_reg_oldsaved = Nullch;
2140 prog->subbeg = PL_bostr;
2141 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2142 }
2143 prog->startp[0] = startpos - PL_bostr;
2144 PL_reginput = startpos;
2145 PL_regstartp = prog->startp;
2146 PL_regendp = prog->endp;
2147 PL_reglastparen = &prog->lastparen;
a01268b5 2148 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2149 prog->lastparen = 0;
03994de8 2150 prog->lastcloseparen = 0;
d6a28714
JH
2151 PL_regsize = 0;
2152 DEBUG_r(PL_reg_starttry = startpos);
2153 if (PL_reg_start_tmpl <= prog->nparens) {
2154 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2155 if(PL_reg_start_tmp)
2156 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2157 else
2158 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2159 }
2160
2161 /* XXXX What this code is doing here?!!! There should be no need
2162 to do this again and again, PL_reglastparen should take care of
3dd2943c 2163 this! --ilya*/
dafc8851
JH
2164
2165 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2166 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2167 * PL_reglastparen), is not needed at all by the test suite
2168 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2169 * enough, for building DynaLoader, or otherwise this
2170 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2171 * will happen. Meanwhile, this code *is* needed for the
2172 * above-mentioned test suite tests to succeed. The common theme
2173 * on those tests seems to be returning null fields from matches.
2174 * --jhi */
dafc8851 2175#if 1
d6a28714
JH
2176 sp = prog->startp;
2177 ep = prog->endp;
2178 if (prog->nparens) {
eb160463 2179 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2180 *++sp = -1;
2181 *++ep = -1;
2182 }
2183 }
dafc8851 2184#endif
02db2b7b 2185 REGCP_SET(lastcp);
d6a28714
JH
2186 if (regmatch(prog->program + 1)) {
2187 prog->endp[0] = PL_reginput - PL_bostr;
2188 return 1;
2189 }
02db2b7b 2190 REGCP_UNWIND(lastcp);
d6a28714
JH
2191 return 0;
2192}
2193
02db2b7b
IZ
2194#define RE_UNWIND_BRANCH 1
2195#define RE_UNWIND_BRANCHJ 2
2196
2197union re_unwind_t;
2198
2199typedef struct { /* XX: makes sense to enlarge it... */
2200 I32 type;
2201 I32 prev;
2202 CHECKPOINT lastcp;
2203} re_unwind_generic_t;
2204
2205typedef struct {
2206 I32 type;
2207 I32 prev;
2208 CHECKPOINT lastcp;
2209 I32 lastparen;
2210 regnode *next;
2211 char *locinput;
2212 I32 nextchr;
2213#ifdef DEBUGGING
2214 int regindent;
2215#endif
2216} re_unwind_branch_t;
2217
2218typedef union re_unwind_t {
2219 I32 type;
2220 re_unwind_generic_t generic;
2221 re_unwind_branch_t branch;
2222} re_unwind_t;
2223
8ba1375e
MJD
2224#define sayYES goto yes
2225#define sayNO goto no
e0f9d4a8 2226#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2227#define sayYES_FINAL goto yes_final
2228#define sayYES_LOUD goto yes_loud
2229#define sayNO_FINAL goto no_final
2230#define sayNO_SILENT goto do_no
2231#define saySAME(x) if (x) goto yes; else goto no
2232
2233#define REPORT_CODE_OFF 24
2234
d6a28714
JH
2235/*
2236 - regmatch - main matching routine
2237 *
2238 * Conceptually the strategy is simple: check to see whether the current
2239 * node matches, call self recursively to see whether the rest matches,
2240 * and then act accordingly. In practice we make some effort to avoid
2241 * recursion, in particular by going through "ordinary" nodes (that don't
2242 * need to know whether the rest of the match failed) by a loop instead of
2243 * by recursion.
2244 */
2245/* [lwall] I've hoisted the register declarations to the outer block in order to
2246 * maybe save a little bit of pushing and popping on the stack. It also takes
2247 * advantage of machines that use a register save mask on subroutine entry.
2248 */
2249STATIC I32 /* 0 failure, 1 success */
2250S_regmatch(pTHX_ regnode *prog)
2251{
d6a28714
JH
2252 register regnode *scan; /* Current node. */
2253 regnode *next; /* Next node. */
2254 regnode *inner; /* Next node in internal branch. */
2255 register I32 nextchr; /* renamed nextchr - nextchar colides with
2256 function of same name */
2257 register I32 n; /* no or next */
b7953727
JH
2258 register I32 ln = 0; /* len or last */
2259 register char *s = Nullch; /* operand or save */
d6a28714 2260 register char *locinput = PL_reginput;
b7953727 2261 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2262 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2263 I32 unwind = 0;
b7953727 2264#if 0
02db2b7b 2265 I32 firstcp = PL_savestack_ix;
b7953727 2266#endif
53c4c00c 2267 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2268#ifdef DEBUGGING
ce333219
JH
2269 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2270 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2271 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2272#endif
02db2b7b 2273
d6a28714
JH
2274#ifdef DEBUGGING
2275 PL_regindent++;
2276#endif
2277
2278 /* Note that nextchr is a byte even in UTF */
2279 nextchr = UCHARAT(locinput);
2280 scan = prog;
2281 while (scan != NULL) {
8ba1375e 2282
2a782b5b 2283 DEBUG_r( {
d6a28714
JH
2284 SV *prop = sv_newmortal();
2285 int docolor = *PL_colors[0];
2286 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2287 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2288 /* The part of the string before starttry has one color
2289 (pref0_len chars), between starttry and current
2290 position another one (pref_len - pref0_len chars),
2291 after the current position the third one.
2292 We assume that pref0_len <= pref_len, otherwise we
2293 decrease pref0_len. */
9041c2e3 2294 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2295 ? (5 + taill) - l : locinput - PL_bostr;
2296 int pref0_len;
d6a28714 2297
df1ffd02 2298 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2299 pref_len++;
2300 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2301 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2302 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2303 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2304 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2305 l--;
d6a28714
JH
2306 if (pref0_len < 0)
2307 pref0_len = 0;
2308 if (pref0_len > pref_len)
2309 pref0_len = pref_len;
2310 regprop(prop, scan);
2a782b5b
JH
2311 {
2312 char *s0 =
f14c76ed 2313 do_utf8 && OP(scan) != CANY ?
2a782b5b 2314 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2315 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2316 locinput - pref_len;
df1ffd02 2317 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2318 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2319 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2320 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2321 locinput - pref_len + pref0_len;
df1ffd02 2322 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2323 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2324 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2325 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2326 locinput;
df1ffd02 2327 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2328 PerlIO_printf(Perl_debug_log,
2329 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2330 (IV)(locinput - PL_bostr),
2331 PL_colors[4],
2332 len0, s0,
2333 PL_colors[5],
2334 PL_colors[2],
2335 len1, s1,
2336 PL_colors[3],
2337 (docolor ? "" : "> <"),
2338 PL_colors[0],
2339 len2, s2,
2340 PL_colors[1],
2341 15 - l - pref_len + 1,
2342 "",
2343 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2344 SvPVX(prop));
2345 }
2346 });
d6a28714
JH
2347
2348 next = scan + NEXT_OFF(scan);
2349 if (next == scan)
2350 next = NULL;
2351
2352 switch (OP(scan)) {
2353 case BOL:
12d33761
HS
2354 if (locinput == PL_bostr || (PL_multiline &&
2355 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2356 {
2357 /* regtill = regbol; */
b8c5462f
JH
2358 break;
2359 }
d6a28714
JH
2360 sayNO;
2361 case MBOL:
12d33761
HS
2362 if (locinput == PL_bostr ||
2363 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2364 {
b8c5462f
JH
2365 break;
2366 }
d6a28714
JH
2367 sayNO;
2368 case SBOL:
c2a73568 2369 if (locinput == PL_bostr)
b8c5462f 2370 break;
d6a28714
JH
2371 sayNO;
2372 case GPOS:
2373 if (locinput == PL_reg_ganch)
2374 break;
2375 sayNO;
2376 case EOL:
2377 if (PL_multiline)
2378 goto meol;
2379 else
2380 goto seol;
2381 case MEOL:
2382 meol:
2383 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2384 sayNO;
b8c5462f 2385 break;
d6a28714
JH
2386 case SEOL:
2387 seol:
2388 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2389 sayNO;
d6a28714 2390 if (PL_regeol - locinput > 1)
b8c5462f 2391 sayNO;
b8c5462f 2392 break;
d6a28714
JH
2393 case EOS:
2394 if (PL_regeol != locinput)
b8c5462f 2395 sayNO;
d6a28714 2396 break;
ffc61ed2 2397 case SANY:
d6a28714 2398 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2399 sayNO;
f33976b4
DB
2400 if (do_utf8) {
2401 locinput += PL_utf8skip[nextchr];
2402 if (locinput > PL_regeol)
2403 sayNO;
2404 nextchr = UCHARAT(locinput);
2405 }
2406 else
2407 nextchr = UCHARAT(++locinput);
2408 break;
2409 case CANY:
2410 if (!nextchr && locinput >= PL_regeol)
2411 sayNO;
b8c5462f 2412 nextchr = UCHARAT(++locinput);
a0d0e21e 2413 break;
ffc61ed2 2414 case REG_ANY:
1aa99e6b
IH
2415 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2416 sayNO;
2417 if (do_utf8) {
b8c5462f 2418 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2419 if (locinput > PL_regeol)
2420 sayNO;
a0ed51b3 2421 nextchr = UCHARAT(locinput);
a0ed51b3 2422 }
1aa99e6b
IH
2423 else
2424 nextchr = UCHARAT(++locinput);
a0ed51b3 2425 break;
d6a28714 2426 case EXACT:
cd439c50
IZ
2427 s = STRING(scan);
2428 ln = STR_LEN(scan);
eb160463 2429 if (do_utf8 != UTF) {
bc517b45 2430 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2431 char *l = locinput;
2432 char *e = s + ln;
bc517b45 2433 STRLEN ulen;
a72c7584 2434
5ff6fc6d
JH
2435 if (do_utf8) {
2436 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2437 while (s < e) {
2438 if (l >= PL_regeol)
5ff6fc6d
JH
2439 sayNO;
2440 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2441 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2442 ckWARN(WARN_UTF8) ?
2443 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2444 sayNO;
bc517b45 2445 l += ulen;
5ff6fc6d 2446 s ++;
1aa99e6b 2447 }
5ff6fc6d
JH
2448 }
2449 else {
2450 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2451 while (s < e) {
2452 if (l >= PL_regeol)
2453 sayNO;
5ff6fc6d 2454 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2455 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2456 ckWARN(WARN_UTF8) ?
2457 0 : UTF8_ALLOW_ANY))
1aa99e6b 2458 sayNO;
bc517b45 2459 s += ulen;
a72c7584 2460 l ++;
1aa99e6b 2461 }
5ff6fc6d 2462 }
1aa99e6b
IH
2463 locinput = l;
2464 nextchr = UCHARAT(locinput);
2465 break;
2466 }
bc517b45 2467 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2468 /* Inline the first character, for speed. */
2469 if (UCHARAT(s) != nextchr)
2470 sayNO;
2471 if (PL_regeol - locinput < ln)
2472 sayNO;
2473 if (ln > 1 && memNE(s, locinput, ln))
2474 sayNO;
2475 locinput += ln;
2476 nextchr = UCHARAT(locinput);
2477 break;
2478 case EXACTFL:
b8c5462f
JH
2479 PL_reg_flags |= RF_tainted;
2480 /* FALL THROUGH */
d6a28714 2481 case EXACTF:
cd439c50
IZ
2482 s = STRING(scan);
2483 ln = STR_LEN(scan);
d6a28714 2484
d07ddd77
JH
2485 if (do_utf8 || UTF) {
2486 /* Either target or the pattern are utf8. */
d6a28714 2487 char *l = locinput;
d07ddd77 2488 char *e = PL_regeol;
bc517b45 2489
eb160463 2490 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2491 l, &e, 0, do_utf8)) {
5486206c
JH
2492 /* One more case for the sharp s:
2493 * pack("U0U*", 0xDF) =~ /ss/i,
2494 * the 0xC3 0x9F are the UTF-8
2495 * byte sequence for the U+00DF. */
2496 if (!(do_utf8 &&
2497 toLOWER(s[0]) == 's' &&
2498 ln >= 2 &&
2499 toLOWER(s[1]) == 's' &&
2500 (U8)l[0] == 0xC3 &&
2501 e - l >= 2 &&
2502 (U8)l[1] == 0x9F))
2503 sayNO;
2504 }
d07ddd77
JH
2505 locinput = e;
2506 nextchr = UCHARAT(locinput);
2507 break;
a0ed51b3 2508 }
d6a28714 2509
bc517b45
JH
2510 /* Neither the target and the pattern are utf8. */
2511
d6a28714
JH
2512 /* Inline the first character, for speed. */
2513 if (UCHARAT(s) != nextchr &&
2514 UCHARAT(s) != ((OP(scan) == EXACTF)
2515 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2516 sayNO;
d6a28714 2517 if (PL_regeol - locinput < ln)
b8c5462f 2518 sayNO;
d6a28714
JH
2519 if (ln > 1 && (OP(scan) == EXACTF
2520 ? ibcmp(s, locinput, ln)
2521 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2522 sayNO;
d6a28714
JH
2523 locinput += ln;
2524 nextchr = UCHARAT(locinput);
a0d0e21e 2525 break;
d6a28714 2526 case ANYOF:
ffc61ed2 2527 if (do_utf8) {
9e55ce06
JH
2528 STRLEN inclasslen = PL_regeol - locinput;
2529
ba7b4546 2530 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2531 sayNO_ANYOF;
ffc61ed2
JH
2532 if (locinput >= PL_regeol)
2533 sayNO;
0f0076b4 2534 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2535 nextchr = UCHARAT(locinput);
e0f9d4a8 2536 break;
ffc61ed2
JH
2537 }
2538 else {
2539 if (nextchr < 0)
2540 nextchr = UCHARAT(locinput);
7d3e948e 2541 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2542 sayNO_ANYOF;
ffc61ed2
JH
2543 if (!nextchr && locinput >= PL_regeol)
2544 sayNO;
2545 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2546 break;
2547 }
2548 no_anyof:
2549 /* If we might have the case of the German sharp s
2550 * in a casefolding Unicode character class. */
2551
ebc501f0
JH
2552 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2553 locinput += SHARP_S_SKIP;
e0f9d4a8 2554 nextchr = UCHARAT(locinput);
ffc61ed2 2555 }
e0f9d4a8
JH
2556 else
2557 sayNO;
b8c5462f 2558 break;
d6a28714 2559 case ALNUML:
b8c5462f
JH
2560 PL_reg_flags |= RF_tainted;
2561 /* FALL THROUGH */
d6a28714 2562 case ALNUM:
b8c5462f 2563 if (!nextchr)
4633a7c4 2564 sayNO;
ffc61ed2 2565 if (do_utf8) {
ad24be35 2566 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2567 if (!(OP(scan) == ALNUM
3568d838 2568 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2569 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2570 {
2571 sayNO;
a0ed51b3 2572 }
b8c5462f 2573 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2574 nextchr = UCHARAT(locinput);
2575 break;
2576 }
ffc61ed2 2577 if (!(OP(scan) == ALNUM
d6a28714 2578 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2579 sayNO;
b8c5462f 2580 nextchr = UCHARAT(++locinput);
a0d0e21e 2581 break;
d6a28714 2582 case NALNUML:
b8c5462f
JH
2583 PL_reg_flags |= RF_tainted;
2584 /* FALL THROUGH */
d6a28714
JH
2585 case NALNUM:
2586 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2587 sayNO;
ffc61ed2 2588 if (do_utf8) {
8269fa76 2589 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2590 if (OP(scan) == NALNUM
3568d838 2591 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2592 : isALNUM_LC_utf8((U8*)locinput))
2593 {
b8c5462f 2594 sayNO;
d6a28714 2595 }
b8c5462f
JH
2596 locinput += PL_utf8skip[nextchr];
2597 nextchr = UCHARAT(locinput);
2598 break;
2599 }
ffc61ed2 2600 if (OP(scan) == NALNUM
d6a28714 2601 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2602 sayNO;
76e3520e 2603 nextchr = UCHARAT(++locinput);
a0d0e21e 2604 break;
d6a28714
JH
2605 case BOUNDL:
2606 case NBOUNDL:
3280af22 2607 PL_reg_flags |= RF_tainted;
bbce6d69 2608 /* FALL THROUGH */
d6a28714
JH
2609 case BOUND:
2610 case NBOUND:
2611 /* was last char in word? */
ffc61ed2 2612 if (do_utf8) {
12d33761
HS
2613 if (locinput == PL_bostr)
2614 ln = '\n';
ffc61ed2 2615 else {
b4f7163a 2616 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2617
b4f7163a 2618 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2619 }
2620 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2621 ln = isALNUM_uni(ln);
8269fa76 2622 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2623 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2624 }
2625 else {
9041c2e3 2626 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2627 n = isALNUM_LC_utf8((U8*)locinput);
2628 }
a0ed51b3 2629 }
d6a28714 2630 else {
12d33761
HS
2631 ln = (locinput != PL_bostr) ?
2632 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2633 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2634 ln = isALNUM(ln);
2635 n = isALNUM(nextchr);
2636 }
2637 else {
2638 ln = isALNUM_LC(ln);
2639 n = isALNUM_LC(nextchr);
2640 }
d6a28714 2641 }
ffc61ed2
JH
2642 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2643 OP(scan) == BOUNDL))
2644 sayNO;
a0ed51b3 2645 break;
d6a28714 2646 case SPACEL:
3280af22 2647 PL_reg_flags |= RF_tainted;
bbce6d69 2648 /* FALL THROUGH */
d6a28714 2649 case SPACE:
9442cb0e 2650 if (!nextchr)
4633a7c4 2651 sayNO;
1aa99e6b 2652 if (do_utf8) {
fd400ab9 2653 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2654 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2655 if (!(OP(scan) == SPACE
3568d838 2656 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2657 : isSPACE_LC_utf8((U8*)locinput)))
2658 {
2659 sayNO;
2660 }
2661 locinput += PL_utf8skip[nextchr];
2662 nextchr = UCHARAT(locinput);
2663 break;
d6a28714 2664 }
ffc61ed2
JH
2665 if (!(OP(scan) == SPACE
2666 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2667 sayNO;
2668 nextchr = UCHARAT(++locinput);
2669 }
2670 else {
2671 if (!(OP(scan) == SPACE
2672 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2673 sayNO;
2674 nextchr = UCHARAT(++locinput);
a0ed51b3 2675 }
a0ed51b3 2676 break;
d6a28714 2677 case NSPACEL:
3280af22 2678 PL_reg_flags |= RF_tainted;
bbce6d69 2679 /* FALL THROUGH */
d6a28714 2680 case NSPACE:
9442cb0e 2681 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2682 sayNO;
1aa99e6b 2683 if (do_utf8) {
8269fa76 2684 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2685 if (OP(scan) == NSPACE
3568d838 2686 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2687 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2688 {
2689 sayNO;
2690 }
2691 locinput += PL_utf8skip[nextchr];
2692 nextchr = UCHARAT(locinput);
2693 break;
a0ed51b3 2694 }
ffc61ed2 2695 if (OP(scan) == NSPACE
d6a28714 2696 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2697 sayNO;
76e3520e 2698 nextchr = UCHARAT(++locinput);
a0d0e21e 2699 break;
d6a28714 2700 case DIGITL:
a0ed51b3
LW
2701 PL_reg_flags |= RF_tainted;
2702 /* FALL THROUGH */
d6a28714 2703 case DIGIT:
9442cb0e 2704 if (!nextchr)
a0ed51b3 2705 sayNO;
1aa99e6b 2706 if (do_utf8) {
8269fa76 2707 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2708 if (!(OP(scan) == DIGIT
3568d838 2709 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2710 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2711 {
a0ed51b3 2712 sayNO;
dfe13c55 2713 }
6f06b55f 2714 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2715 nextchr = UCHARAT(locinput);
2716 break;
2717 }
ffc61ed2 2718 if (!(OP(scan) == DIGIT
9442cb0e 2719 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2720 sayNO;
2721 nextchr = UCHARAT(++locinput);
2722 break;
d6a28714 2723 case NDIGITL:
b8c5462f
JH
2724 PL_reg_flags |= RF_tainted;
2725 /* FALL THROUGH */
d6a28714 2726 case NDIGIT:
9442cb0e 2727 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2728 sayNO;
1aa99e6b 2729 if (do_utf8) {
8269fa76 2730 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2731 if (OP(scan) == NDIGIT
3568d838 2732 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2733 : isDIGIT_LC_utf8((U8*)locinput))
2734 {
a0ed51b3 2735 sayNO;
9442cb0e 2736 }
6f06b55f 2737 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2738 nextchr = UCHARAT(locinput);
2739 break;
2740 }
ffc61ed2 2741 if (OP(scan) == NDIGIT
9442cb0e 2742 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2743 sayNO;
2744 nextchr = UCHARAT(++locinput);
2745 break;
2746 case CLUMP:
b7c83a7e 2747 if (locinput >= PL_regeol)
a0ed51b3 2748 sayNO;
b7c83a7e
JH
2749 if (do_utf8) {
2750 LOAD_UTF8_CHARCLASS(mark,"~");
2751 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2752 sayNO;
2753 locinput += PL_utf8skip[nextchr];
2754 while (locinput < PL_regeol &&
2755 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2756 locinput += UTF8SKIP(locinput);
2757 if (locinput > PL_regeol)
2758 sayNO;
eb08e2da
JH
2759 }
2760 else
2761 locinput++;
a0ed51b3
LW
2762 nextchr = UCHARAT(locinput);
2763 break;
c8756f30 2764 case REFFL:
3280af22 2765 PL_reg_flags |= RF_tainted;
c8756f30 2766 /* FALL THROUGH */
c277df42 2767 case REF:
c8756f30 2768 case REFF:
c277df42 2769 n = ARG(scan); /* which paren pair */
cf93c79d 2770 ln = PL_regstartp[n];
2c2d71f5 2771 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2772 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2773 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2774 if (ln == PL_regendp[n])
a0d0e21e 2775 break;
a0ed51b3 2776
cf93c79d 2777 s = PL_bostr + ln;
1aa99e6b 2778 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2779 char *l = locinput;
cf93c79d 2780 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2781 /*
2782 * Note that we can't do the "other character" lookup trick as
2783 * in the 8-bit case (no pun intended) because in Unicode we
2784 * have to map both upper and title case to lower case.
2785 */
2786 if (OP(scan) == REFF) {
a2a2844f 2787 STRLEN ulen1, ulen2;
e7ae6809
JH
2788 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2789 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2790 while (s < e) {
2791 if (l >= PL_regeol)
2792 sayNO;
a2a2844f
JH
2793 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2794 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2795 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2796 sayNO;
a2a2844f
JH
2797 s += ulen1;
2798 l += ulen2;
a0ed51b3
LW
2799 }
2800 }
2801 locinput = l;
2802 nextchr = UCHARAT(locinput);
2803 break;
2804 }
2805
a0d0e21e 2806 /* Inline the first character, for speed. */
76e3520e 2807 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2808 (OP(scan) == REF ||
2809 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2810 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2811 sayNO;
cf93c79d 2812 ln = PL_regendp[n] - ln;
3280af22 2813 if (locinput + ln > PL_regeol)
4633a7c4 2814 sayNO;
c8756f30
AK
2815 if (ln > 1 && (OP(scan) == REF
2816 ? memNE(s, locinput, ln)
2817 : (OP(scan) == REFF
2818 ? ibcmp(s, locinput, ln)
2819 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2820 sayNO;
a0d0e21e 2821 locinput += ln;
76e3520e 2822 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2823 break;
2824
2825 case NOTHING:
c277df42 2826 case TAIL:
a0d0e21e
LW
2827 break;
2828 case BACK:
2829 break;
c277df42
IZ
2830 case EVAL:
2831 {
2832 dSP;
533c011a 2833 OP_4tree *oop = PL_op;
3280af22 2834 COP *ocurcop = PL_curcop;
f3548bdc 2835 PAD *old_comppad;
c277df42 2836 SV *ret;
9041c2e3 2837
c277df42 2838 n = ARG(scan);
533c011a 2839 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2840 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2841 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2842 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2843
8e5e9ebe
RGS
2844 {
2845 SV **before = SP;
2846 CALLRUNOPS(aTHX); /* Scalar context. */
2847 SPAGAIN;
2848 if (SP == before)
075aa684 2849 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2850 else {
2851 ret = POPs;
2852 PUTBACK;
2853 }
2854 }
2855
0f5d15d6 2856 PL_op = oop;
f3548bdc 2857 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2858 PL_curcop = ocurcop;
c277df42 2859 if (logical) {
0f5d15d6
IZ
2860 if (logical == 2) { /* Postponed subexpression. */
2861 regexp *re;
22c35a8c 2862 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2863 re_cc_state state;
0f5d15d6 2864 CHECKPOINT cp, lastcp;
cb50f42d 2865 int toggleutf;
faf82a0b 2866 register SV *sv;
0f5d15d6 2867
faf82a0b
AE
2868 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2869 mg = mg_find(sv, PERL_MAGIC_qr);
2870 else if (SvSMAGICAL(ret)) {
2871 if (SvGMAGICAL(ret))
2872 sv_unmagic(ret, PERL_MAGIC_qr);
2873 else
2874 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 2875 }
faf82a0b 2876
0f5d15d6
IZ
2877 if (mg) {
2878 re = (regexp *)mg->mg_obj;
df0003d4 2879 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2880 }
2881 else {
2882 STRLEN len;
2883 char *t = SvPV(ret, len);
2884 PMOP pm;
2885 char *oprecomp = PL_regprecomp;
2886 I32 osize = PL_regsize;
2887 I32 onpar = PL_regnpar;
2888
5fcd1c1b 2889 Zero(&pm, 1, PMOP);
cb50f42d 2890 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2891 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2892 if (!(SvFLAGS(ret)
faf82a0b
AE
2893 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2894 | SVs_GMG)))
14befaf4
DM
2895 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2896 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2897 PL_regprecomp = oprecomp;
2898 PL_regsize = osize;
2899 PL_regnpar = onpar;
2900 }
2901 DEBUG_r(
9041c2e3 2902 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2903 "Entering embedded `%s%.60s%s%s'\n",
2904 PL_colors[0],
2905 re->precomp,
2906 PL_colors[1],
2907 (strlen(re->precomp) > 60 ? "..." : ""))
2908 );
2909 state.node = next;
2910 state.prev = PL_reg_call_cc;
2911 state.cc = PL_regcc;
2912 state.re = PL_reg_re;
2913
2ab05381 2914 PL_regcc = 0;
9041c2e3 2915
0f5d15d6 2916 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2917 REGCP_SET(lastcp);
0f5d15d6
IZ
2918 cache_re(re);
2919 state.ss = PL_savestack_ix;
2920 *PL_reglastparen = 0;
a01268b5 2921 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2922 PL_reg_call_cc = &state;
2923 PL_reginput = locinput;
cb50f42d
YST
2924 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2925 ((re->reganch & ROPT_UTF8) != 0);
2926 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2927
2928 /* XXXX This is too dramatic a measure... */
2929 PL_reg_maxiter = 0;
2930
0f5d15d6 2931 if (regmatch(re->program + 1)) {
2c914db6
IZ
2932 /* Even though we succeeded, we need to restore
2933 global variables, since we may be wrapped inside
2934 SUSPEND, thus the match may be not finished yet. */
2935
2936 /* XXXX Do this only if SUSPENDed? */
2937 PL_reg_call_cc = state.prev;
2938 PL_regcc = state.cc;
2939 PL_reg_re = state.re;
2940 cache_re(PL_reg_re);
cb50f42d 2941 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2942
2943 /* XXXX This is too dramatic a measure... */
2944 PL_reg_maxiter = 0;
2945
2946 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2947 ReREFCNT_dec(re);
2948 regcpblow(cp);
2949 sayYES;
2950 }
0f5d15d6 2951 ReREFCNT_dec(re);
02db2b7b 2952 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2953 regcppop();
2954 PL_reg_call_cc = state.prev;
2955 PL_regcc = state.cc;
2956 PL_reg_re = state.re;
d3790889 2957 cache_re(PL_reg_re);
cb50f42d 2958 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2959
2960 /* XXXX This is too dramatic a measure... */
2961 PL_reg_maxiter = 0;
2962
8e514ae6 2963 logical = 0;
0f5d15d6
IZ
2964 sayNO;
2965 }
c277df42 2966 sw = SvTRUE(ret);
0f5d15d6 2967 logical = 0;
a0ed51b3
LW
2968 }
2969 else
3280af22 2970 sv_setsv(save_scalar(PL_replgv), ret);
c277df42
IZ
2971 break;
2972 }
a0d0e21e 2973 case OPEN:
c277df42 2974 n = ARG(scan); /* which paren pair */
3280af22
NIS
2975 PL_reg_start_tmp[n] = locinput;
2976 if (n > PL_regsize)
2977 PL_regsize = n;
a0d0e21e
LW
2978 break;
2979 case CLOSE:
c277df42 2980 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2981 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2982 PL_regendp[n] = locinput - PL_bostr;
eb160463 2983 if (n > (I32)*PL_reglastparen)
3280af22 2984 *PL_reglastparen = n;
a01268b5 2985 *PL_reglastcloseparen = n;
a0d0e21e 2986 break;
c277df42
IZ
2987 case GROUPP:
2988 n = ARG(scan); /* which paren pair */
eb160463 2989 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2990 break;
2991 case IFTHEN:
2c2d71f5 2992 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2993 if (sw)
2994 next = NEXTOPER(NEXTOPER(scan));
2995 else {
2996 next = scan + ARG(scan);
2997 if (OP(next) == IFTHEN) /* Fake one. */
2998 next = NEXTOPER(NEXTOPER(next));
2999 }
3000 break;
3001 case LOGICAL:
0f5d15d6 3002 logical = scan->flags;
c277df42 3003 break;
2ab05381
IZ
3004/*******************************************************************
3005 PL_regcc contains infoblock about the innermost (...)* loop, and
3006 a pointer to the next outer infoblock.
3007
3008 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3009
3010 1) After matching X, regnode for CURLYX is processed;
3011
9041c2e3 3012 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
3013 regmatch() recursively with the starting point at WHILEM node;
3014
3015 3) Each hit of WHILEM node tries to match A and Z (in the order
3016 depending on the current iteration, min/max of {min,max} and
3017 greediness). The information about where are nodes for "A"
3018 and "Z" is read from the infoblock, as is info on how many times "A"
3019 was already matched, and greediness.
3020
3021 4) After A matches, the same WHILEM node is hit again.
3022
3023 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3024 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3025 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3026 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3027 of the external loop.
3028
3029 Currently present infoblocks form a tree with a stem formed by PL_curcc
3030 and whatever it mentions via ->next, and additional attached trees
3031 corresponding to temporarily unset infoblocks as in "5" above.
3032
9041c2e3 3033 In the following picture infoblocks for outer loop of
2ab05381
IZ
3034 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3035 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3036 infoblocks are drawn below the "reset" infoblock.
3037
3038 In fact in the picture below we do not show failed matches for Z and T
3039 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3040 more obvious *why* one needs to *temporary* unset infoblocks.]
3041
3042 Matched REx position InfoBlocks Comment
3043 (Y(A)*?Z)*?T x
3044 Y(A)*?Z)*?T x <- O
3045 Y (A)*?Z)*?T x <- O
3046 Y A)*?Z)*?T x <- O <- I
3047 YA )*?Z)*?T x <- O <- I
3048 YA A)*?Z)*?T x <- O <- I
3049 YAA )*?Z)*?T x <- O <- I
3050 YAA Z)*?T x <- O # Temporary unset I
3051 I
3052
3053 YAAZ Y(A)*?Z)*?T x <- O
3054 I
3055
3056 YAAZY (A)*?Z)*?T x <- O
3057 I
3058
3059 YAAZY A)*?Z)*?T x <- O <- I
3060 I
3061
3062 YAAZYA )*?Z)*?T x <- O <- I
3063 I
3064
3065 YAAZYA Z)*?T x <- O # Temporary unset I
3066 I,I
3067
3068 YAAZYAZ )*?T x <- O
3069 I,I
3070
3071 YAAZYAZ T x # Temporary unset O
3072 O
3073 I,I
3074
3075 YAAZYAZT x
3076 O
3077 I,I
3078 *******************************************************************/
a0d0e21e
LW
3079 case CURLYX: {
3080 CURCUR cc;
3280af22 3081 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3082 /* No need to save/restore up to this paren */
3083 I32 parenfloor = scan->flags;
c277df42
IZ
3084
3085 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3086 next += ARG(next);
3280af22
NIS
3087 cc.oldcc = PL_regcc;
3088 PL_regcc = &cc;
cb434fcc
IZ
3089 /* XXXX Probably it is better to teach regpush to support
3090 parenfloor > PL_regsize... */
eb160463 3091 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3092 parenfloor = *PL_reglastparen; /* Pessimization... */
3093 cc.parenfloor = parenfloor;
a0d0e21e
LW
3094 cc.cur = -1;
3095 cc.min = ARG1(scan);
3096 cc.max = ARG2(scan);
c277df42 3097 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3098 cc.next = next;
3099 cc.minmod = minmod;
3100 cc.lastloc = 0;
3280af22 3101 PL_reginput = locinput;
a0d0e21e
LW
3102 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3103 regcpblow(cp);
3280af22 3104 PL_regcc = cc.oldcc;
4633a7c4 3105 saySAME(n);
a0d0e21e
LW
3106 }
3107 /* NOT REACHED */
3108 case WHILEM: {
3109 /*
3110 * This is really hard to understand, because after we match
3111 * what we're trying to match, we must make sure the rest of
2c2d71f5 3112 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3113 * to go back UP the parse tree by recursing ever deeper. And
3114 * if it fails, we have to reset our parent's current state
3115 * that we can try again after backing off.
3116 */
3117
c277df42 3118 CHECKPOINT cp, lastcp;
3280af22 3119 CURCUR* cc = PL_regcc;
c277df42
IZ
3120 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3121
4633a7c4 3122 n = cc->cur + 1; /* how many we know we matched */
3280af22 3123 PL_reginput = locinput;
a0d0e21e 3124
c277df42 3125 DEBUG_r(
9041c2e3 3126 PerlIO_printf(Perl_debug_log,
91f3b821 3127 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3128 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3129 (long)n, (long)cc->min,
2797576d 3130 (long)cc->max, PTR2UV(cc))
c277df42 3131 );
4633a7c4 3132
a0d0e21e
LW
3133 /* If degenerate scan matches "", assume scan done. */
3134
579cf2c3 3135 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3136 PL_regcc = cc->oldcc;
2ab05381
IZ
3137 if (PL_regcc)
3138 ln = PL_regcc->cur;
c277df42 3139 DEBUG_r(
c3464db5
DD
3140 PerlIO_printf(Perl_debug_log,
3141 "%*s empty match detected, try continuation...\n",
3280af22 3142 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3143 );
a0d0e21e 3144 if (regmatch(cc->next))
4633a7c4 3145 sayYES;
2ab05381
IZ
3146 if (PL_regcc)
3147 PL_regcc->cur = ln;
3280af22 3148 PL_regcc = cc;
4633a7c4 3149 sayNO;
a0d0e21e
LW
3150 }
3151
3152 /* First just match a string of min scans. */
3153
3154 if (n < cc->min) {
3155 cc->cur = n;
3156 cc->lastloc = locinput;
4633a7c4
LW
3157 if (regmatch(cc->scan))
3158 sayYES;
3159 cc->cur = n - 1;
c277df42 3160 cc->lastloc = lastloc;
4633a7c4 3161 sayNO;
a0d0e21e
LW
3162 }
3163
2c2d71f5
JH
3164 if (scan->flags) {
3165 /* Check whether we already were at this position.
3166 Postpone detection until we know the match is not
3167 *that* much linear. */
3168 if (!PL_reg_maxiter) {
3169 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3170 PL_reg_leftiter = PL_reg_maxiter;
3171 }
3172 if (PL_reg_leftiter-- == 0) {
3173 I32 size = (PL_reg_maxiter + 7)/8;
3174 if (PL_reg_poscache) {
eb160463 3175 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3176 Renew(PL_reg_poscache, size, char);
3177 PL_reg_poscache_size = size;
3178 }
3179 Zero(PL_reg_poscache, size, char);
3180 }
3181 else {
3182 PL_reg_poscache_size = size;
3183 Newz(29, PL_reg_poscache, size, char);
3184 }
3185 DEBUG_r(
3186 PerlIO_printf(Perl_debug_log,
3187 "%sDetected a super-linear match, switching on caching%s...\n",
3188 PL_colors[4], PL_colors[5])
3189 );
3190 }
3191 if (PL_reg_leftiter < 0) {
3192 I32 o = locinput - PL_bostr, b;
3193
3194 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3195 b = o % 8;
3196 o /= 8;
3197 if (PL_reg_poscache[o] & (1<<b)) {
3198 DEBUG_r(
3199 PerlIO_printf(Perl_debug_log,
3200 "%*s already tried at this position...\n",
3201 REPORT_CODE_OFF+PL_regindent*2, "")
3202 );
7821416a 3203 sayNO_SILENT;
2c2d71f5
JH
3204 }
3205 PL_reg_poscache[o] |= (1<<b);
3206 }
3207 }
3208
a0d0e21e
LW
3209 /* Prefer next over scan for minimal matching. */
3210
3211 if (cc->minmod) {
3280af22 3212 PL_regcc = cc->oldcc;
2ab05381
IZ
3213 if (PL_regcc)
3214 ln = PL_regcc->cur;
5f05dabc 3215 cp = regcppush(cc->parenfloor);
02db2b7b 3216 REGCP_SET(lastcp);
5f05dabc 3217 if (regmatch(cc->next)) {
c277df42 3218 regcpblow(cp);
4633a7c4 3219 sayYES; /* All done. */
5f05dabc 3220 }
02db2b7b 3221 REGCP_UNWIND(lastcp);
5f05dabc 3222 regcppop();
2ab05381
IZ
3223 if (PL_regcc)
3224 PL_regcc->cur = ln;
3280af22 3225 PL_regcc = cc;
a0d0e21e 3226
c277df42 3227 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3228 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3229 && !(PL_reg_flags & RF_warned)) {
3230 PL_reg_flags |= RF_warned;
9014280d 3231 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3232 "Complex regular subexpression recursion",
3233 REG_INFTY - 1);
c277df42 3234 }
4633a7c4 3235 sayNO;
c277df42 3236 }
a687059c 3237
c277df42 3238 DEBUG_r(
c3464db5
DD
3239 PerlIO_printf(Perl_debug_log,
3240 "%*s trying longer...\n",
3280af22 3241 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3242 );
a0d0e21e 3243 /* Try scanning more and see if it helps. */
3280af22 3244 PL_reginput = locinput;
a0d0e21e
LW
3245 cc->cur = n;
3246 cc->lastloc = locinput;
5f05dabc 3247 cp = regcppush(cc->parenfloor);
02db2b7b 3248 REGCP_SET(lastcp);
5f05dabc 3249 if (regmatch(cc->scan)) {
c277df42 3250 regcpblow(cp);
4633a7c4 3251 sayYES;
5f05dabc 3252 }
02db2b7b 3253 REGCP_UNWIND(lastcp);
5f05dabc 3254 regcppop();
4633a7c4 3255 cc->cur = n - 1;
c277df42 3256 cc->lastloc = lastloc;
4633a7c4 3257 sayNO;
a0d0e21e
LW
3258 }
3259
3260 /* Prefer scan over next for maximal matching. */
3261
3262 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3263 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3264 cc->cur = n;
3265 cc->lastloc = locinput;
02db2b7b 3266 REGCP_SET(lastcp);
5f05dabc 3267 if (regmatch(cc->scan)) {
c277df42 3268 regcpblow(cp);
4633a7c4 3269 sayYES;
5f05dabc 3270 }
02db2b7b 3271 REGCP_UNWIND(lastcp);
a0d0e21e 3272 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3273 PL_reginput = locinput;
c277df42 3274 DEBUG_r(
c3464db5
DD
3275 PerlIO_printf(Perl_debug_log,
3276 "%*s failed, try continuation...\n",
3280af22 3277 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3278 );
3279 }
9041c2e3 3280 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3281 && !(PL_reg_flags & RF_warned)) {
3280af22 3282 PL_reg_flags |= RF_warned;
9014280d 3283 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3284 "Complex regular subexpression recursion",
3285 REG_INFTY - 1);
a0d0e21e
LW
3286 }
3287
3288 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3289 PL_regcc = cc->oldcc;
2ab05381
IZ
3290 if (PL_regcc)
3291 ln = PL_regcc->cur;
a0d0e21e 3292 if (regmatch(cc->next))
4633a7c4 3293 sayYES;
2ab05381
IZ
3294 if (PL_regcc)
3295 PL_regcc->cur = ln;
3280af22 3296 PL_regcc = cc;
4633a7c4 3297 cc->cur = n - 1;
c277df42 3298 cc->lastloc = lastloc;
4633a7c4 3299 sayNO;
a0d0e21e
LW
3300 }
3301 /* NOT REACHED */
9041c2e3 3302 case BRANCHJ:
c277df42
IZ
3303 next = scan + ARG(scan);
3304 if (next == scan)
3305 next = NULL;
3306 inner = NEXTOPER(NEXTOPER(scan));
3307 goto do_branch;
9041c2e3 3308 case BRANCH:
c277df42
IZ
3309 inner = NEXTOPER(scan);
3310 do_branch:
3311 {
c277df42
IZ
3312 c1 = OP(scan);
3313 if (OP(next) != c1) /* No choice. */
3314 next = inner; /* Avoid recursion. */
a0d0e21e 3315 else {
02db2b7b
IZ
3316 I32 lastparen = *PL_reglastparen;
3317 I32 unwind1;
3318 re_unwind_branch_t *uw;
3319
3320 /* Put unwinding data on stack */
3321 unwind1 = SSNEWt(1,re_unwind_branch_t);
3322 uw = SSPTRt(unwind1,re_unwind_branch_t);
3323 uw->prev = unwind;
3324 unwind = unwind1;
3325 uw->type = ((c1 == BRANCH)
3326 ? RE_UNWIND_BRANCH
3327 : RE_UNWIND_BRANCHJ);
3328 uw->lastparen = lastparen;
3329 uw->next = next;
3330 uw->locinput = locinput;
3331 uw->nextchr = nextchr;
3332#ifdef DEBUGGING
3333 uw->regindent = ++PL_regindent;
3334#endif
c277df42 3335
02db2b7b
IZ
3336 REGCP_SET(uw->lastcp);
3337
3338 /* Now go into the first branch */
3339 next = inner;
a687059c 3340 }
a0d0e21e
LW
3341 }
3342 break;
3343 case MINMOD:
3344 minmod = 1;
3345 break;
c277df42
IZ
3346 case CURLYM:
3347 {
00db4c45 3348 I32 l = 0;
c277df42 3349 CHECKPOINT lastcp;
9041c2e3 3350
c277df42
IZ
3351 /* We suppose that the next guy does not need
3352 backtracking: in particular, it is of constant length,
3353 and has no parenths to influence future backrefs. */
3354 ln = ARG1(scan); /* min to match */
3355 n = ARG2(scan); /* max to match */
c277df42
IZ
3356 paren = scan->flags;
3357 if (paren) {
3280af22
NIS
3358 if (paren > PL_regsize)
3359 PL_regsize = paren;
eb160463 3360 if (paren > (I32)*PL_reglastparen)
3280af22 3361 *PL_reglastparen = paren;
c277df42 3362 }
dc45a647 3363 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3364 if (paren)
3365 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3366 PL_reginput = locinput;
c277df42
IZ
3367 if (minmod) {
3368 minmod = 0;
3369 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3370 sayNO;
f31a99c8
HS
3371 /* if we matched something zero-length we don't need to
3372 backtrack - capturing parens are already defined, so
3373 the caveat in the maximal case doesn't apply
3374
3375 XXXX if ln == 0, we can redo this check first time
3376 through the following loop
3377 */
3378 if (ln && l == 0)
3379 n = ln; /* don't backtrack */
3280af22 3380 locinput = PL_reginput;
cca55fe3 3381 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3382 regnode *text_node = next;
3383
cca55fe3 3384 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3385
cca55fe3 3386 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3387 else {
cca55fe3
JP
3388 if (PL_regkind[(U8)OP(text_node)] == REF) {
3389 I32 n, ln;
3390 n = ARG(text_node); /* which paren pair */
3391 ln = PL_regstartp[n];
3392 /* assume yes if we haven't seen CLOSEn */
3393 if (
eb160463 3394 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3395 ln == -1 ||
3396 ln == PL_regendp[n]
3397 ) {
3398 c1 = c2 = -1000;
3399 goto assume_ok_MM;
3400 }
3401 c1 = *(PL_bostr + ln);
3402 }
3403 else { c1 = (U8)*STRING(text_node); }
af5decee 3404 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3405 c2 = PL_fold[c1];
af5decee 3406 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3407 c2 = PL_fold_locale[c1];
3408 else
3409 c2 = c1;
3410 }
a0ed51b3
LW
3411 }
3412 else
c277df42 3413 c1 = c2 = -1000;
cca55fe3 3414 assume_ok_MM:
02db2b7b 3415 REGCP_SET(lastcp);
5f4b28b2 3416 /* This may be improved if l == 0. */
c277df42
IZ
3417 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3418 /* If it could work, try it. */
3419 if (c1 == -1000 ||
3280af22
NIS
3420 UCHARAT(PL_reginput) == c1 ||
3421 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3422 {
3423 if (paren) {
f31a99c8 3424 if (ln) {
cf93c79d
IZ
3425 PL_regstartp[paren] =
3426 HOPc(PL_reginput, -l) - PL_bostr;
3427 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3428 }
3429 else
cf93c79d 3430 PL_regendp[paren] = -1;
c277df42
IZ
3431 }
3432 if (regmatch(next))
3433 sayYES;
02db2b7b 3434 REGCP_UNWIND(lastcp);
c277df42
IZ
3435 }
3436 /* Couldn't or didn't -- move forward. */
3280af22 3437 PL_reginput = locinput;
c277df42
IZ
3438 if (regrepeat_hard(scan, 1, &l)) {
3439 ln++;
3280af22 3440 locinput = PL_reginput;
c277df42
IZ
3441 }
3442 else
3443 sayNO;
3444 }
a0ed51b3
LW
3445 }
3446 else {
c277df42 3447 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3448 /* if we matched something zero-length we don't need to
3449 backtrack, unless the minimum count is zero and we
3450 are capturing the result - in that case the capture
3451 being defined or not may affect later execution
3452 */
3453 if (n != 0 && l == 0 && !(paren && ln == 0))
3454 ln = n; /* don't backtrack */
3280af22 3455 locinput = PL_reginput;
c277df42 3456 DEBUG_r(
5c0ca799 3457 PerlIO_printf(Perl_debug_log,
faccc32b 3458 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3459 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3460 (IV) n, (IV)l)
c277df42
IZ
3461 );
3462 if (n >= ln) {
cca55fe3 3463 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3464 regnode *text_node = next;
3465
cca55fe3 3466 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3467
cca55fe3 3468 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3469 else {
cca55fe3
JP
3470 if (PL_regkind[(U8)OP(text_node)] == REF) {
3471 I32 n, ln;
3472 n = ARG(text_node); /* which paren pair */
3473 ln = PL_regstartp[n];
3474 /* assume yes if we haven't seen CLOSEn */
3475 if (
eb160463 3476 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3477 ln == -1 ||
3478 ln == PL_regendp[n]
3479 ) {
3480 c1 = c2 = -1000;
3481 goto assume_ok_REG;
3482 }
3483 c1 = *(PL_bostr + ln);
3484 }
3485 else { c1 = (U8)*STRING(text_node); }
3486
af5decee 3487 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3488 c2 = PL_fold[c1];
af5decee 3489 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3490 c2 = PL_fold_locale[c1];
3491 else
3492 c2 = c1;
3493 }
a0ed51b3
LW
3494 }
3495 else
c277df42
IZ
3496 c1 = c2 = -1000;
3497 }
cca55fe3 3498 assume_ok_REG:
02db2b7b 3499 REGCP_SET(lastcp);
c277df42
IZ
3500 while (n >= ln) {
3501 /* If it could work, try it. */
3502 if (c1 == -1000 ||
3280af22
NIS
3503 UCHARAT(PL_reginput) == c1 ||
3504 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3505 {
3506 DEBUG_r(
c3464db5 3507 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3508 "%*s trying tail with n=%"IVdf"...\n",
3509 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3510 );
3511 if (paren) {
3512 if (n) {
cf93c79d
IZ
3513 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3514 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3515 }
a0ed51b3 3516 else
cf93c79d 3517 PL_regendp[paren] = -1;
c277df42 3518 }
a0ed51b3
LW
3519 if (regmatch(next))
3520 sayYES;
02db2b7b 3521 REGCP_UNWIND(lastcp);
a0ed51b3 3522 }
c277df42
IZ
3523 /* Couldn't or didn't -- back up. */
3524 n--;
dfe13c55 3525 locinput = HOPc(locinput, -l);
3280af22 3526 PL_reginput = locinput;
c277df42
IZ
3527 }
3528 }
3529 sayNO;
3530 break;
3531 }
3532 case CURLYN:
3533 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3534 if (paren > PL_regsize)
3535 PL_regsize = paren;
eb160463 3536 if (paren > (I32)*PL_reglastparen)
3280af22 3537 *PL_reglastparen = paren;
c277df42
IZ
3538 ln = ARG1(scan); /* min to match */
3539 n = ARG2(scan); /* max to match */
dc45a647 3540 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3541 goto repeat;
a0d0e21e 3542 case CURLY:
c277df42 3543 paren = 0;
a0d0e21e
LW
3544 ln = ARG1(scan); /* min to match */
3545 n = ARG2(scan); /* max to match */
dc45a647 3546 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3547 goto repeat;
3548 case STAR:
3549 ln = 0;
c277df42 3550 n = REG_INFTY;
a0d0e21e 3551 scan = NEXTOPER(scan);
c277df42 3552 paren = 0;
a0d0e21e
LW
3553 goto repeat;
3554 case PLUS:
c277df42
IZ
3555 ln = 1;
3556 n = REG_INFTY;
3557 scan = NEXTOPER(scan);
3558 paren = 0;
3559 repeat:
a0d0e21e
LW
3560 /*
3561 * Lookahead to avoid useless match attempts
3562 * when we know what character comes next.
3563 */
5f80c4cf
JP
3564
3565 /*
3566 * Used to only do .*x and .*?x, but now it allows
3567 * for )'s, ('s and (?{ ... })'s to be in the way
3568 * of the quantifier and the EXACT-like node. -- japhy
3569 */
3570
cca55fe3 3571 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3572 U8 *s;
3573 regnode *text_node = next;
3574
cca55fe3 3575 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3576
cca55fe3 3577 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3578 else {
cca55fe3
JP
3579 if (PL_regkind[(U8)OP(text_node)] == REF) {
3580 I32 n, ln;
3581 n = ARG(text_node); /* which paren pair */
3582 ln = PL_regstartp[n];
3583 /* assume yes if we haven't seen CLOSEn */
3584 if (
eb160463 3585 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3586 ln == -1 ||
3587 ln == PL_regendp[n]
3588 ) {
3589 c1 = c2 = -1000;
3590 goto assume_ok_easy;
3591 }
9246c65e 3592 s = (U8*)PL_bostr + ln;
cca55fe3
JP
3593 }
3594 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3595
3596 if (!UTF) {
3597 c2 = c1 = *s;
f65d3ee7 3598 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3599 c2 = PL_fold[c1];
f65d3ee7 3600 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3601 c2 = PL_fold_locale[c1];
1aa99e6b 3602 }
5f80c4cf 3603 else { /* UTF */
f65d3ee7 3604 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3605 STRLEN ulen1, ulen2;
e7ae6809
JH
3606 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3607 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3608
3609 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3610 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3611
872c91ae
JH
3612 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3613 ckWARN(WARN_UTF8) ?
3614 0 : UTF8_ALLOW_ANY);
3615 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3616 ckWARN(WARN_UTF8) ?
3617 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3618 }
3619 else {
872c91ae
JH
3620 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3621 ckWARN(WARN_UTF8) ?
3622 0 : UTF8_ALLOW_ANY);
5f80c4cf 3623 }
1aa99e6b
IH
3624 }
3625 }
bbce6d69 3626 }
a0d0e21e 3627 else
bbce6d69 3628 c1 = c2 = -1000;
cca55fe3 3629 assume_ok_easy:
3280af22 3630 PL_reginput = locinput;
a0d0e21e 3631 if (minmod) {
c277df42 3632 CHECKPOINT lastcp;
a0d0e21e
LW
3633 minmod = 0;
3634 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3635 sayNO;
a0ed51b3 3636 locinput = PL_reginput;
02db2b7b 3637 REGCP_SET(lastcp);
0fe9bf95 3638 if (c1 != -1000) {
1aa99e6b 3639 char *e; /* Should not check after this */
0fe9bf95 3640 char *old = locinput;
b2f2f093 3641 int count = 0;
0fe9bf95 3642
1aa99e6b 3643 if (n == REG_INFTY) {
0fe9bf95 3644 e = PL_regeol - 1;
1aa99e6b
IH
3645 if (do_utf8)
3646 while (UTF8_IS_CONTINUATION(*(U8*)e))
3647 e--;
3648 }
3649 else if (do_utf8) {
3650 int m = n - ln;
3651 for (e = locinput;
3652 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3653 e += UTF8SKIP(e);
3654 }
3655 else {
3656 e = locinput + n - ln;
3657 if (e >= PL_regeol)
3658 e = PL_regeol - 1;
3659 }
0fe9bf95
IZ
3660 while (1) {
3661 /* Find place 'next' could work */
1aa99e6b
IH
3662 if (!do_utf8) {
3663 if (c1 == c2) {
a8e8ab15
JH
3664 while (locinput <= e &&
3665 UCHARAT(locinput) != c1)
1aa99e6b
IH
3666 locinput++;
3667 } else {
9041c2e3 3668 while (locinput <= e
a8e8ab15
JH
3669 && UCHARAT(locinput) != c1
3670 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3671 locinput++;
3672 }
3673 count = locinput - old;
3674 }
3675 else {
3676 STRLEN len;
3677 if (c1 == c2) {
872c91ae
JH
3678 /* count initialised to
3679 * utf8_distance(old, locinput) */
b2f2f093 3680 while (locinput <= e &&
872c91ae
JH
3681 utf8n_to_uvchr((U8*)locinput,
3682 UTF8_MAXLEN, &len,
3683 ckWARN(WARN_UTF8) ?
eb160463 3684 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3685 locinput += len;
b2f2f093
JH
3686 count++;
3687 }
1aa99e6b 3688 } else {
872c91ae
JH
3689 /* count initialised to
3690 * utf8_distance(old, locinput) */
b2f2f093 3691 while (locinput <= e) {
872c91ae
JH
3692 UV c = utf8n_to_uvchr((U8*)locinput,
3693 UTF8_MAXLEN, &len,
3694 ckWARN(WARN_UTF8) ?
3695 0 : UTF8_ALLOW_ANY);
eb160463 3696 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3697 break;
b2f2f093
JH
3698 locinput += len;
3699 count++;
1aa99e6b
IH
3700 }
3701 }
0fe9bf95 3702 }
9041c2e3 3703 if (locinput > e)
0fe9bf95
IZ
3704 sayNO;
3705 /* PL_reginput == old now */
3706 if (locinput != old) {
3707 ln = 1; /* Did some */
1aa99e6b 3708 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3709 sayNO;
3710 }
3711 /* PL_reginput == locinput now */
29d1e993 3712 TRYPAREN(paren, ln, locinput);
0fe9bf95 3713 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3714 REGCP_UNWIND(lastcp);
0fe9bf95 3715 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3716 old = locinput;
3717 if (do_utf8)
3718 locinput += UTF8SKIP(locinput);
3719 else
3720 locinput++;
b2f2f093 3721 count = 1;
0fe9bf95
IZ
3722 }
3723 }
3724 else
c277df42 3725 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3726 UV c;
3727 if (c1 != -1000) {
3728 if (do_utf8)
872c91ae
JH
3729 c = utf8n_to_uvchr((U8*)PL_reginput,
3730 UTF8_MAXLEN, 0,
3731 ckWARN(WARN_UTF8) ?
3732 0 : UTF8_ALLOW_ANY);
1aa99e6b 3733 else
9041c2e3 3734 c = UCHARAT(PL_reginput);
2390ecbc 3735 /* If it could work, try it. */
eb160463 3736 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3737 {
ecc99935 3738 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3739 REGCP_UNWIND(lastcp);
3740 }
1aa99e6b 3741 }
a0d0e21e 3742 /* If it could work, try it. */
2390ecbc 3743 else if (c1 == -1000)
bbce6d69 3744 {
ecc99935 3745 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3746 REGCP_UNWIND(lastcp);
bbce6d69 3747 }
c277df42 3748 /* Couldn't or didn't -- move forward. */
a0ed51b3 3749 PL_reginput = locinput;
a0d0e21e
LW
3750 if (regrepeat(scan, 1)) {
3751 ln++;
a0ed51b3
LW
3752 locinput = PL_reginput;
3753 }
3754 else
4633a7c4 3755 sayNO;
a0d0e21e
LW
3756 }
3757 }
3758 else {
c277df42 3759 CHECKPOINT lastcp;
a0d0e21e 3760 n = regrepeat(scan, n);
a0ed51b3 3761 locinput = PL_reginput;
22c35a8c 3762 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3763 ((!PL_multiline && OP(next) != MEOL) ||
3764 OP(next) == SEOL || OP(next) == EOS))
3765 {
a0d0e21e 3766 ln = n; /* why back off? */
1aeab75a
GS
3767 /* ...because $ and \Z can match before *and* after
3768 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3769 We should back off by one in this case. */
3770 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3771 ln--;
3772 }
02db2b7b 3773 REGCP_SET(lastcp);
c277df42 3774 if (paren) {
8fa7f367 3775 UV c = 0;
c277df42 3776 while (n >= ln) {
1aa99e6b
IH
3777 if (c1 != -1000) {
3778 if (do_utf8)
872c91ae
JH
3779 c = utf8n_to_uvchr((U8*)PL_reginput,
3780 UTF8_MAXLEN, 0,
3781 ckWARN(WARN_UTF8) ?
3782 0 : UTF8_ALLOW_ANY);
1aa99e6b 3783 else
9041c2e3 3784 c = UCHARAT(PL_reginput);
1aa99e6b 3785 }
c277df42 3786 /* If it could work, try it. */
eb160463 3787 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3788 {
29d1e993 3789 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3790 REGCP_UNWIND(lastcp);
c277df42
IZ
3791 }
3792 /* Couldn't or didn't -- back up. */
3793 n--;
dfe13c55 3794 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3795 }
a0ed51b3
LW
3796 }
3797 else {
8fa7f367 3798 UV c = 0;
c277df42 3799 while (n >= ln) {
1aa99e6b
IH
3800 if (c1 != -1000) {
3801 if (do_utf8)
872c91ae
JH
3802 c = utf8n_to_uvchr((U8*)PL_reginput,
3803 UTF8_MAXLEN, 0,
3804 ckWARN(WARN_UTF8) ?
3805 0 : UTF8_ALLOW_ANY);
1aa99e6b 3806 else
9041c2e3 3807 c = UCHARAT(PL_reginput);
1aa99e6b 3808 }
c277df42 3809 /* If it could work, try it. */
eb160463 3810 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3811 {
29d1e993 3812 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3813 REGCP_UNWIND(lastcp);
c277df42
IZ
3814 }
3815 /* Couldn't or didn't -- back up. */
3816 n--;
dfe13c55 3817 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3818 }
a0d0e21e
LW
3819 }
3820 }
4633a7c4 3821 sayNO;
c277df42 3822 break;
a0d0e21e 3823 case END:
0f5d15d6
IZ
3824 if (PL_reg_call_cc) {
3825 re_cc_state *cur_call_cc = PL_reg_call_cc;
3826 CURCUR *cctmp = PL_regcc;
3827 regexp *re = PL_reg_re;
3828 CHECKPOINT cp, lastcp;
3829
3830 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3831 REGCP_SET(lastcp);
0f5d15d6
IZ
3832 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3833 the caller. */
3834 PL_reginput = locinput; /* Make position available to
3835 the callcc. */
3836 cache_re(PL_reg_call_cc->re);
3837 PL_regcc = PL_reg_call_cc->cc;
3838 PL_reg_call_cc = PL_reg_call_cc->prev;
3839 if (regmatch(cur_call_cc->node)) {
3840 PL_reg_call_cc = cur_call_cc;
3841 regcpblow(cp);
3842 sayYES;
3843 }
02db2b7b 3844 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3845 regcppop();
3846 PL_reg_call_cc = cur_call_cc;
3847 PL_regcc = cctmp;
3848 PL_reg_re = re;
3849 cache_re(re);
3850
3851 DEBUG_r(
3852 PerlIO_printf(Perl_debug_log,
3853 "%*s continuation failed...\n",
3854 REPORT_CODE_OFF+PL_regindent*2, "")
3855 );
7821416a 3856 sayNO_SILENT;
0f5d15d6 3857 }
7821416a
IZ
3858 if (locinput < PL_regtill) {
3859 DEBUG_r(PerlIO_printf(Perl_debug_log,
3860 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3861 PL_colors[4],
3862 (long)(locinput - PL_reg_starttry),
3863 (long)(PL_regtill - PL_reg_starttry),
3864 PL_colors[5]));
3865 sayNO_FINAL; /* Cannot match: too short. */
3866 }
3867 PL_reginput = locinput; /* put where regtry can find it */
3868 sayYES_FINAL; /* Success! */
7e5428c5 3869 case SUCCEED:
3280af22 3870 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3871 sayYES_LOUD; /* Success! */
c277df42
IZ
3872 case SUSPEND:
3873 n = 1;
9fe1d20c 3874 PL_reginput = locinput;
9041c2e3 3875 goto do_ifmatch;
a0d0e21e 3876 case UNLESSM:
c277df42 3877 n = 0;
a0ed51b3 3878 if (scan->flags) {
efb30f32
HS
3879 s = HOPBACKc(locinput, scan->flags);
3880 if (!s)
3881 goto say_yes;
3882 PL_reginput = s;
a0ed51b3
LW
3883 }
3884 else
3885 PL_reginput = locinput;
c277df42
IZ
3886 goto do_ifmatch;
3887 case IFMATCH:
3888 n = 1;
a0ed51b3 3889 if (scan->flags) {
efb30f32
HS
3890 s = HOPBACKc(locinput, scan->flags);
3891 if (!s)
3892 goto say_no;
3893 PL_reginput = s;
a0ed51b3
LW
3894 }
3895 else
3896 PL_reginput = locinput;
3897
c277df42 3898 do_ifmatch:
c277df42
IZ
3899 inner = NEXTOPER(NEXTOPER(scan));
3900 if (regmatch(inner) != n) {
3901 say_no:
3902 if (logical) {
3903 logical = 0;
3904 sw = 0;
3905 goto do_longjump;
a0ed51b3
LW
3906 }
3907 else
c277df42
IZ
3908 sayNO;
3909 }
3910 say_yes:
3911 if (logical) {
3912 logical = 0;
3913 sw = 1;
3914 }
fe44a5e8 3915 if (OP(scan) == SUSPEND) {
3280af22 3916 locinput = PL_reginput;
565764a8 3917 nextchr = UCHARAT(locinput);
fe44a5e8 3918 }
c277df42
IZ
3919 /* FALL THROUGH. */
3920 case LONGJMP:
3921 do_longjump:
3922 next = scan + ARG(scan);
3923 if (next == scan)
3924 next = NULL;
a0d0e21e
LW
3925 break;
3926 default:
b900a521 3927 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3928 PTR2UV(scan), OP(scan));
cea2e8a9 3929 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3930 }
02db2b7b 3931 reenter:
a0d0e21e
LW
3932 scan = next;
3933 }
a687059c 3934
a0d0e21e
LW
3935 /*
3936 * We get here only if there's trouble -- normally "case END" is
3937 * the terminating point.
3938 */
cea2e8a9 3939 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3940 /*NOTREACHED*/
4633a7c4
LW
3941 sayNO;
3942
7821416a
IZ
3943yes_loud:
3944 DEBUG_r(
3945 PerlIO_printf(Perl_debug_log,
3946 "%*s %scould match...%s\n",
3947 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3948 );
3949 goto yes;
3950yes_final:
3951 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3952 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3953yes:
3954#ifdef DEBUGGING
3280af22 3955 PL_regindent--;
4633a7c4 3956#endif
02db2b7b
IZ
3957
3958#if 0 /* Breaks $^R */
3959 if (unwind)
3960 regcpblow(firstcp);
3961#endif
4633a7c4
LW
3962 return 1;
3963
3964no:
7821416a
IZ
3965 DEBUG_r(
3966 PerlIO_printf(Perl_debug_log,
3967 "%*s %sfailed...%s\n",
3968 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3969 );
3970 goto do_no;
3971no_final:
3972do_no:
02db2b7b
IZ
3973 if (unwind) {
3974 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3975
3976 switch (uw->type) {
3977 case RE_UNWIND_BRANCH:
3978 case RE_UNWIND_BRANCHJ:
3979 {
3980 re_unwind_branch_t *uwb = &(uw->branch);
3981 I32 lastparen = uwb->lastparen;
9041c2e3 3982
02db2b7b
IZ
3983 REGCP_UNWIND(uwb->lastcp);
3984 for (n = *PL_reglastparen; n > lastparen; n--)
3985 PL_regendp[n] = -1;
3986 *PL_reglastparen = n;
3987 scan = next = uwb->next;
9041c2e3
NIS
3988 if ( !scan ||
3989 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3990 ? BRANCH : BRANCHJ) ) { /* Failure */
3991 unwind = uwb->prev;
3992#ifdef DEBUGGING
3993 PL_regindent--;
3994#endif
3995 goto do_no;
3996 }
3997 /* Have more choice yet. Reuse the same uwb. */
3998 /*SUPPRESS 560*/
3999 if ((n = (uwb->type == RE_UNWIND_BRANCH
4000 ? NEXT_OFF(next) : ARG(next))))
4001 next += n;
4002 else
4003 next = NULL; /* XXXX Needn't unwinding in this case... */
4004 uwb->next = next;
4005 next = NEXTOPER(scan);
4006 if (uwb->type == RE_UNWIND_BRANCHJ)
4007 next = NEXTOPER(next);
4008 locinput = uwb->locinput;
4009 nextchr = uwb->nextchr;
4010#ifdef DEBUGGING
4011 PL_regindent = uwb->regindent;
4012#endif
4013
4014 goto reenter;
4015 }
4016 /* NOT REACHED */
4017 default:
4018 Perl_croak(aTHX_ "regexp unwind memory corruption");
4019 }
4020 /* NOT REACHED */
4021 }
4633a7c4 4022#ifdef DEBUGGING
3280af22 4023 PL_regindent--;
4633a7c4 4024#endif
a0d0e21e 4025 return 0;
a687059c
LW
4026}
4027
4028/*
4029 - regrepeat - repeatedly match something simple, report how many
4030 */
4031/*
4032 * [This routine now assumes that it will only match on things of length 1.
4033 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4034 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4035 */
76e3520e 4036STATIC I32
cea2e8a9 4037S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 4038{
a0d0e21e 4039 register char *scan;
a0d0e21e 4040 register I32 c;
3280af22 4041 register char *loceol = PL_regeol;
a0ed51b3 4042 register I32 hardcount = 0;
53c4c00c 4043 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4044
3280af22 4045 scan = PL_reginput;
faf11cac
HS
4046 if (max == REG_INFTY)
4047 max = I32_MAX;
4048 else if (max < loceol - scan)
a0d0e21e 4049 loceol = scan + max;
a0d0e21e 4050 switch (OP(p)) {
22c35a8c 4051 case REG_ANY:
1aa99e6b 4052 if (do_utf8) {
ffc61ed2 4053 loceol = PL_regeol;
1aa99e6b 4054 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4055 scan += UTF8SKIP(scan);
4056 hardcount++;
4057 }
4058 } else {
4059 while (scan < loceol && *scan != '\n')
4060 scan++;
a0ed51b3
LW
4061 }
4062 break;
ffc61ed2 4063 case SANY:
def8e4ea
JH
4064 if (do_utf8) {
4065 loceol = PL_regeol;
a0804c9e 4066 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4067 scan += UTF8SKIP(scan);
4068 hardcount++;
4069 }
4070 }
4071 else
4072 scan = loceol;
a0ed51b3 4073 break;
f33976b4
DB
4074 case CANY:
4075 scan = loceol;
4076 break;
bbce6d69 4077 case EXACT: /* length of string is 1 */
cd439c50 4078 c = (U8)*STRING(p);
bbce6d69 4079 while (scan < loceol && UCHARAT(scan) == c)
4080 scan++;
4081 break;
4082 case EXACTF: /* length of string is 1 */
cd439c50 4083 c = (U8)*STRING(p);
bbce6d69 4084 while (scan < loceol &&
22c35a8c 4085 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4086 scan++;
4087 break;
4088 case EXACTFL: /* length of string is 1 */
3280af22 4089 PL_reg_flags |= RF_tainted;
cd439c50 4090 c = (U8)*STRING(p);
bbce6d69 4091 while (scan < loceol &&
22c35a8c 4092 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4093 scan++;
4094 break;
4095 case ANYOF:
ffc61ed2
JH
4096 if (do_utf8) {
4097 loceol = PL_regeol;
cfc92286
JH
4098 while (hardcount < max && scan < loceol &&
4099 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4100 scan += UTF8SKIP(scan);
4101 hardcount++;
4102 }
4103 } else {
7d3e948e 4104 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4105 scan++;
4106 }
a0d0e21e
LW
4107 break;
4108 case ALNUM:
1aa99e6b 4109 if (do_utf8) {
ffc61ed2 4110 loceol = PL_regeol;
8269fa76 4111 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4112 while (hardcount < max && scan < loceol &&
3568d838 4113 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4114 scan += UTF8SKIP(scan);
4115 hardcount++;
4116 }
4117 } else {
4118 while (scan < loceol && isALNUM(*scan))
4119 scan++;
a0ed51b3
LW
4120 }
4121 break;
bbce6d69 4122 case ALNUML:
3280af22 4123 PL_reg_flags |= RF_tainted;
1aa99e6b 4124 if (do_utf8) {
ffc61ed2 4125 loceol = PL_regeol;
1aa99e6b
IH
4126 while (hardcount < max && scan < loceol &&
4127 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4128 scan += UTF8SKIP(scan);
4129 hardcount++;
4130 }
4131 } else {
4132 while (scan < loceol && isALNUM_LC(*scan))
4133 scan++;
a0ed51b3
LW
4134 }
4135 break;
a0d0e21e 4136 case NALNUM:
1aa99e6b 4137 if (do_utf8) {
ffc61ed2 4138 loceol = PL_regeol;
8269fa76 4139 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4140 while (hardcount < max && scan < loceol &&
3568d838 4141 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4142 scan += UTF8SKIP(scan);
4143 hardcount++;
4144 }
4145 } else {
4146 while (scan < loceol && !isALNUM(*scan))
4147 scan++;
a0ed51b3
LW
4148 }
4149 break;
bbce6d69 4150 case NALNUML:
3280af22 4151 PL_reg_flags |= RF_tainted;
1aa99e6b 4152 if (do_utf8) {
ffc61ed2 4153 loceol = PL_regeol;
1aa99e6b
IH
4154 while (hardcount < max && scan < loceol &&
4155 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4156 scan += UTF8SKIP(scan);
4157 hardcount++;
4158 }
4159 } else {
4160 while (scan < loceol && !isALNUM_LC(*scan))
4161 scan++;
a0ed51b3
LW
4162 }
4163 break;
a0d0e21e 4164 case SPACE:
1aa99e6b 4165 if (do_utf8) {
ffc61ed2 4166 loceol = PL_regeol;
8269fa76 4167 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4168 while (hardcount < max && scan < loceol &&
3568d838
JH
4169 (*scan == ' ' ||
4170 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4171 scan += UTF8SKIP(scan);
4172 hardcount++;
4173 }
4174 } else {
4175 while (scan < loceol && isSPACE(*scan))
4176 scan++;
a0ed51b3
LW
4177 }
4178 break;
bbce6d69 4179 case SPACEL:
3280af22 4180 PL_reg_flags |= RF_tainted;
1aa99e6b 4181 if (do_utf8) {
ffc61ed2 4182 loceol = PL_regeol;
1aa99e6b 4183 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4184 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4185 scan += UTF8SKIP(scan);
4186 hardcount++;
4187 }
4188 } else {
4189 while (scan < loceol && isSPACE_LC(*scan))
4190 scan++;
a0ed51b3
LW
4191 }
4192 break;
a0d0e21e 4193 case NSPACE:
1aa99e6b 4194 if (do_utf8) {
ffc61ed2 4195 loceol = PL_regeol;
8269fa76 4196 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4197 while (hardcount < max && scan < loceol &&
3568d838
JH
4198 !(*scan == ' ' ||
4199 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4200 scan += UTF8SKIP(scan);
4201 hardcount++;
4202 }
4203 } else {
4204 while (scan < loceol && !isSPACE(*scan))
4205 scan++;
4206 break;
a0ed51b3 4207 }
bbce6d69 4208 case NSPACEL:
3280af22 4209 PL_reg_flags |= RF_tainted;
1aa99e6b 4210 if (do_utf8) {
ffc61ed2 4211 loceol = PL_regeol;
1aa99e6b 4212 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4213 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4214 scan += UTF8SKIP(scan);
4215 hardcount++;
4216 }
4217 } else {
4218 while (scan < loceol && !isSPACE_LC(*scan))
4219 scan++;
a0ed51b3
LW
4220 }
4221 break;
a0d0e21e 4222 case DIGIT:
1aa99e6b 4223 if (do_utf8) {
ffc61ed2 4224 loceol = PL_regeol;
8269fa76 4225 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4226 while (hardcount < max && scan < loceol &&
3568d838 4227 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4228 scan += UTF8SKIP(scan);
4229 hardcount++;
4230 }
4231 } else {
4232 while (scan < loceol && isDIGIT(*scan))
4233 scan++;
a0ed51b3
LW
4234 }
4235 break;
a0d0e21e 4236 case NDIGIT:
1aa99e6b 4237 if (do_utf8) {
ffc61ed2 4238 loceol = PL_regeol;
8269fa76 4239 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4240 while (hardcount < max && scan < loceol &&
3568d838 4241 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4242 scan += UTF8SKIP(scan);
4243 hardcount++;
4244 }
4245 } else {
4246 while (scan < loceol && !isDIGIT(*scan))
4247 scan++;
a0ed51b3
LW
4248 }
4249 break;
a0d0e21e
LW
4250 default: /* Called on something of 0 width. */
4251 break; /* So match right here or not at all. */
4252 }
a687059c 4253
a0ed51b3
LW
4254 if (hardcount)
4255 c = hardcount;
4256 else
4257 c = scan - PL_reginput;
3280af22 4258 PL_reginput = scan;
a687059c 4259
9041c2e3 4260 DEBUG_r(
c277df42
IZ
4261 {
4262 SV *prop = sv_newmortal();
4263
4264 regprop(prop, p);
9041c2e3
NIS
4265 PerlIO_printf(Perl_debug_log,
4266 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4267 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4268 });
9041c2e3 4269
a0d0e21e 4270 return(c);
a687059c
LW
4271}
4272
4273/*
c277df42 4274 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4275 *
c277df42
IZ
4276 * The repeater is supposed to have constant length.
4277 */
4278
76e3520e 4279STATIC I32
cea2e8a9 4280S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4281{
b7953727 4282 register char *scan = Nullch;
c277df42 4283 register char *start;
3280af22 4284 register char *loceol = PL_regeol;
a0ed51b3 4285 I32 l = 0;
708e3b05 4286 I32 count = 0, res = 1;
a0ed51b3
LW
4287
4288 if (!max)
4289 return 0;
c277df42 4290
3280af22 4291 start = PL_reginput;
53c4c00c 4292 if (PL_reg_match_utf8) {
708e3b05 4293 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4294 if (!count++) {
4295 l = 0;
4296 while (start < PL_reginput) {
4297 l++;
4298 start += UTF8SKIP(start);
4299 }
4300 *lp = l;
4301 if (l == 0)
4302 return max;
4303 }
4304 if (count == max)
4305 return count;
4306 }
4307 }
4308 else {
708e3b05 4309 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4310 if (!count++) {
4311 *lp = l = PL_reginput - start;
4312 if (max != REG_INFTY && l*max < loceol - scan)
4313 loceol = scan + l*max;
4314 if (l == 0)
4315 return max;
c277df42
IZ
4316 }
4317 }
4318 }
708e3b05 4319 if (!res)
3280af22 4320 PL_reginput = scan;
9041c2e3 4321
a0ed51b3 4322 return count;
c277df42
IZ
4323}
4324
4325/*
ffc61ed2
JH
4326- regclass_swash - prepare the utf8 swash
4327*/
4328
4329SV *
9e55ce06 4330Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4331{
9e55ce06
JH
4332 SV *sw = NULL;
4333 SV *si = NULL;
4334 SV *alt = NULL;
ffc61ed2
JH
4335
4336 if (PL_regdata && PL_regdata->count) {
4337 U32 n = ARG(node);
4338
4339 if (PL_regdata->what[n] == 's') {
4340 SV *rv = (SV*)PL_regdata->data[n];
4341 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4342 SV **ary = AvARRAY(av);
9e55ce06 4343 SV **a, **b;
9041c2e3 4344
9e55ce06
JH
4345 /* See the end of regcomp.c:S_reglass() for
4346 * documentation of these array elements. */
4347
b11f357e
JH
4348 si = *ary;
4349 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4350 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4351
ffc61ed2
JH
4352 if (a)
4353 sw = *a;
4354 else if (si && doinit) {
4355 sw = swash_init("utf8", "", si, 1, 0);
4356 (void)av_store(av, 1, sw);
4357 }
9e55ce06
JH
4358 if (b)
4359 alt = *b;
ffc61ed2
JH
4360 }
4361 }
4362
9e55ce06
JH
4363 if (listsvp)
4364 *listsvp = si;
4365 if (altsvp)
4366 *altsvp = alt;
ffc61ed2
JH
4367
4368 return sw;
4369}
4370
4371/*
ba7b4546 4372 - reginclass - determine if a character falls into a character class
832705d4
JH
4373
4374 The n is the ANYOF regnode, the p is the target string, lenp
4375 is pointer to the maximum length of how far to go in the p
4376 (if the lenp is zero, UTF8SKIP(p) is used),
4377 do_utf8 tells whether the target string is in UTF-8.
4378
bbce6d69 4379 */
4380
76e3520e 4381STATIC bool
ba7b4546 4382S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4383{
ffc61ed2 4384 char flags = ANYOF_FLAGS(n);
bbce6d69 4385 bool match = FALSE;
cc07378b 4386 UV c = *p;
ae9ddab8 4387 STRLEN len = 0;
9e55ce06 4388 STRLEN plen;
1aa99e6b 4389
ae9ddab8
JH
4390 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4391 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4392 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4393
0f0076b4 4394 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4395 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4396 if (lenp)
4397 *lenp = 0;
ffc61ed2 4398 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4399 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4400 match = TRUE;
bbce6d69 4401 }
3568d838 4402 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4403 match = TRUE;
ffc61ed2 4404 if (!match) {
9e55ce06
JH
4405 AV *av;
4406 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4407
4408 if (sw) {
3568d838 4409 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4410 match = TRUE;
4411 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4412 if (!match && lenp && av) {
4413 I32 i;
4414
4415 for (i = 0; i <= av_len(av); i++) {
4416 SV* sv = *av_fetch(av, i, FALSE);
4417 STRLEN len;
4418 char *s = SvPV(sv, len);
4419
061b10df 4420 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4421 *lenp = len;
4422 match = TRUE;
4423 break;
4424 }
4425 }
4426 }
4427 if (!match) {
4a623e43
JH
4428 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4429 STRLEN tmplen;
4430
9e55ce06
JH
4431 to_utf8_fold(p, tmpbuf, &tmplen);
4432 if (swash_fetch(sw, tmpbuf, do_utf8))
4433 match = TRUE;
4434 }
ffc61ed2
JH
4435 }
4436 }
bbce6d69 4437 }
9e55ce06 4438 if (match && lenp && *lenp == 0)
0f0076b4 4439 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4440 }
1aa99e6b 4441 if (!match && c < 256) {
ffc61ed2
JH
4442 if (ANYOF_BITMAP_TEST(n, c))
4443 match = TRUE;
4444 else if (flags & ANYOF_FOLD) {
eb160463 4445 U8 f;
a0ed51b3 4446
ffc61ed2
JH
4447 if (flags & ANYOF_LOCALE) {
4448 PL_reg_flags |= RF_tainted;
4449 f = PL_fold_locale[c];
4450 }
4451 else
4452 f = PL_fold[c];
4453 if (f != c && ANYOF_BITMAP_TEST(n, f))
4454 match = TRUE;
4455 }
4456
4457 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4458 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4459 if (
4460 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4461 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4462 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4463 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4472 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4473 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4474 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4475 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4476 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4477 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4478 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4479 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4480 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4481 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4482 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4483 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4484 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4485 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4486 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4487 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4488 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4489 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4490 ) /* How's that for a conditional? */
4491 {
4492 match = TRUE;
4493 }
a0ed51b3 4494 }
a0ed51b3
LW
4495 }
4496
a0ed51b3
LW
4497 return (flags & ANYOF_INVERT) ? !match : match;
4498}
161b471a 4499
dfe13c55 4500STATIC U8 *
cea2e8a9 4501S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4502{
1aa99e6b
IH
4503 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4504}
4505
4506STATIC U8 *
4507S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4508{
a0ed51b3 4509 if (off >= 0) {
1aa99e6b 4510 while (off-- && s < lim) {
ffc61ed2 4511 /* XXX could check well-formedness here */
a0ed51b3 4512 s += UTF8SKIP(s);
ffc61ed2 4513 }
a0ed51b3
LW
4514 }
4515 else {
4516 while (off++) {
1aa99e6b 4517 if (s > lim) {
a0ed51b3 4518 s--;
ffc61ed2 4519 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4520 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4521 s--;
ffc61ed2
JH
4522 }
4523 /* XXX could check well-formedness here */
a0ed51b3
LW
4524 }
4525 }
4526 }
4527 return s;
4528}
161b471a 4529
dfe13c55 4530STATIC U8 *
1aa99e6b 4531S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4532{
1aa99e6b
IH
4533 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4534}
4535
4536STATIC U8 *
4537S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4538{
4539 if (off >= 0) {
1aa99e6b 4540 while (off-- && s < lim) {
ffc61ed2 4541 /* XXX could check well-formedness here */
a0ed51b3 4542 s += UTF8SKIP(s);
ffc61ed2 4543 }
a0ed51b3
LW
4544 if (off >= 0)
4545 return 0;
4546 }
4547 else {
4548 while (off++) {
1aa99e6b 4549 if (s > lim) {
a0ed51b3 4550 s--;
ffc61ed2 4551 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4552 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4553 s--;
ffc61ed2
JH
4554 }
4555 /* XXX could check well-formedness here */
a0ed51b3
LW
4556 }
4557 else
4558 break;
4559 }
4560 if (off <= 0)
4561 return 0;
4562 }
4563 return s;
4564}
51371543 4565
51371543 4566static void
acfe0abc 4567restore_pos(pTHX_ void *arg)
51371543 4568{
51371543
GS
4569 if (PL_reg_eval_set) {
4570 if (PL_reg_oldsaved) {
4571 PL_reg_re->subbeg = PL_reg_oldsaved;
4572 PL_reg_re->sublen = PL_reg_oldsavedlen;
ed252734
NC
4573#ifdef PERL_COPY_ON_WRITE
4574 PL_reg_re->saved_copy = PL_nrs;
4575#endif
51371543
GS
4576 RX_MATCH_COPIED_on(PL_reg_re);
4577 }
4578 PL_reg_magic->mg_len = PL_reg_oldpos;
4579 PL_reg_eval_set = 0;
4580 PL_curpm = PL_reg_oldcurpm;
4581 }
4582}
33b8afdf
JH
4583
4584STATIC void
4585S_to_utf8_substr(pTHX_ register regexp *prog)
4586{
4587 SV* sv;
4588 if (prog->float_substr && !prog->float_utf8) {
4589 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4590 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4591 sv_utf8_upgrade(sv);
4592 if (SvTAIL(prog->float_substr))
4593 SvTAIL_on(sv);
4594 if (prog->float_substr == prog->check_substr)
4595 prog->check_utf8 = sv;
4596 }
4597 if (prog->anchored_substr && !prog->anchored_utf8) {
4598 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4599 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4600 sv_utf8_upgrade(sv);
4601 if (SvTAIL(prog->anchored_substr))
4602 SvTAIL_on(sv);
4603 if (prog->anchored_substr == prog->check_substr)
4604 prog->check_utf8 = sv;
4605 }
4606}
4607
4608STATIC void
4609S_to_byte_substr(pTHX_ register regexp *prog)
4610{
4611 SV* sv;
4612 if (prog->float_utf8 && !prog->float_substr) {
4613 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4614 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4615 if (sv_utf8_downgrade(sv, TRUE)) {
4616 if (SvTAIL(prog->float_utf8))
4617 SvTAIL_on(sv);
4618 } else {
4619 SvREFCNT_dec(sv);
4620 prog->float_substr = sv = &PL_sv_undef;
4621 }
4622 if (prog->float_utf8 == prog->check_utf8)
4623 prog->check_substr = sv;
4624 }
4625 if (prog->anchored_utf8 && !prog->anchored_substr) {
4626 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4627 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4628 if (sv_utf8_downgrade(sv, TRUE)) {
4629 if (SvTAIL(prog->anchored_utf8))
4630 SvTAIL_on(sv);
4631 } else {
4632 SvREFCNT_dec(sv);
4633 prog->anchored_substr = sv = &PL_sv_undef;
4634 }
4635 if (prog->anchored_utf8 == prog->check_utf8)
4636 prog->check_substr = sv;
4637 }
4638}