This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
operation on `PL_na' may be undefined
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
be3c0a43 70 **** Copyright (c) 1991-2002, Larry Wall
a687059c 71 ****
9ef589d8
LW
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
a687059c
LW
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
864dbfa3 80#define PERL_IN_REGEXEC_C
a687059c 81#include "perl.h"
0f5d15d6 82
a687059c
LW
83#include "regcomp.h"
84
c277df42
IZ
85#define RF_tainted 1 /* tainted information used? */
86#define RF_warned 2 /* warned about big count? */
ce862d02 87#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
88#define RF_utf8 8 /* String contains multibyte chars? */
89
eb160463 90#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
91
92#define RS_init 1 /* eval environment created */
93#define RS_set 2 /* replsv value is set */
c277df42 94
a687059c
LW
95#ifndef STATIC
96#define STATIC static
97#endif
98
ba7b4546 99#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 100
c277df42
IZ
101/*
102 * Forwards.
103 */
104
33b8afdf 105#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 106#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 107
dfe13c55
GS
108#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
109#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
110#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
111#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
112#define HOPc(pos,off) ((char*)HOP(pos,off))
113#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 114
efb30f32 115#define HOPBACK(pos, off) ( \
e54858b0 116 (PL_reg_match_utf8) \
efb30f32
HS
117 ? reghopmaybe((U8*)pos, -off) \
118 : (pos - off >= PL_bostr) \
119 ? (U8*)(pos - off) \
120 : (U8*)NULL \
121)
122#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
123
1aa99e6b
IH
124#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
125#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
127#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
129#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
130
977e4b84 131#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 132
5f80c4cf 133/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
134#define JUMPABLE(rn) ( \
135 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
136 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
137 OP(rn) == PLUS || OP(rn) == MINMOD || \
138 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
139)
140
cca55fe3
JP
141#define HAS_TEXT(rn) ( \
142 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
143)
e2d8ce26 144
a84d97b6
HS
145/*
146 Search for mandatory following text node; for lookahead, the text must
147 follow but for lookbehind (rn->flags != 0) we skip to the next step.
148*/
cca55fe3 149#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 150 while (JUMPABLE(rn)) \
a84d97b6 151 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 152 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
153 else if (OP(rn) == PLUS) \
154 rn = NEXTOPER(rn); \
a84d97b6
HS
155 else if (OP(rn) == IFMATCH) \
156 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 157 else rn += NEXT_OFF(rn); \
5f80c4cf 158} STMT_END
74750237 159
acfe0abc 160static void restore_pos(pTHX_ void *arg);
51371543 161
76e3520e 162STATIC CHECKPOINT
cea2e8a9 163S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 164{
3280af22 165 int retval = PL_savestack_ix;
b1ce53c5
JH
166#define REGCP_PAREN_ELEMS 4
167 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
168 int p;
169
e49a9654
IH
170 if (paren_elems_to_push < 0)
171 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
172
a01268b5 173#define REGCP_OTHER_ELEMS 6
b1ce53c5 174 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 175 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 176/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
177 SSPUSHINT(PL_regendp[p]);
178 SSPUSHINT(PL_regstartp[p]);
3280af22 179 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
180 SSPUSHINT(p);
181 }
b1ce53c5 182/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
183 SSPUSHINT(PL_regsize);
184 SSPUSHINT(*PL_reglastparen);
a01268b5 185 SSPUSHINT(*PL_reglastcloseparen);
3280af22 186 SSPUSHPTR(PL_reginput);
41123dfd
JH
187#define REGCP_FRAME_ELEMS 2
188/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
189 * are needed for the regexp context stack bookkeeping. */
190 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 191 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 192
a0d0e21e
LW
193 return retval;
194}
195
c277df42 196/* These are needed since we do not localize EVAL nodes: */
02db2b7b 197# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 199 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 200
02db2b7b 201# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 202 PerlIO_printf(Perl_debug_log, \
faccc32b 203 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 204 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 205
76e3520e 206STATIC char *
cea2e8a9 207S_regcppop(pTHX)
a0d0e21e 208{
b1ce53c5 209 I32 i;
a0d0e21e
LW
210 U32 paren = 0;
211 char *input;
cf93c79d 212 I32 tmps;
b1ce53c5
JH
213
214 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 215 i = SSPOPINT;
b1ce53c5
JH
216 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
217 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 218 input = (char *) SSPOPPTR;
a01268b5 219 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
220 *PL_reglastparen = SSPOPINT;
221 PL_regsize = SSPOPINT;
b1ce53c5
JH
222
223 /* Now restore the parentheses context. */
41123dfd
JH
224 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
225 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 226 paren = (U32)SSPOPINT;
3280af22 227 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
228 PL_regstartp[paren] = SSPOPINT;
229 tmps = SSPOPINT;
3280af22
NIS
230 if (paren <= *PL_reglastparen)
231 PL_regendp[paren] = tmps;
c277df42 232 DEBUG_r(
c3464db5 233 PerlIO_printf(Perl_debug_log,
b900a521 234 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 235 (UV)paren, (IV)PL_regstartp[paren],
b900a521 236 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 237 (IV)PL_regendp[paren],
3280af22 238 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 239 );
a0d0e21e 240 }
c277df42 241 DEBUG_r(
eb160463 242 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 243 PerlIO_printf(Perl_debug_log,
faccc32b
JH
244 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
245 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
246 }
247 );
daf18116 248#if 1
dafc8851
JH
249 /* It would seem that the similar code in regtry()
250 * already takes care of this, and in fact it is in
251 * a better location to since this code can #if 0-ed out
252 * but the code in regtry() is needed or otherwise tests
253 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
254 * (as of patchlevel 7877) will fail. Then again,
255 * this code seems to be necessary or otherwise
256 * building DynaLoader will fail:
257 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
258 * --jhi */
eb160463
GS
259 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
260 if ((I32)paren > PL_regsize)
cf93c79d
IZ
261 PL_regstartp[paren] = -1;
262 PL_regendp[paren] = -1;
a0d0e21e 263 }
dafc8851 264#endif
a0d0e21e
LW
265 return input;
266}
267
0f5d15d6 268STATIC char *
cea2e8a9 269S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
270{
271 I32 tmp = PL_savestack_ix;
272
273 PL_savestack_ix = ss;
274 regcppop();
275 PL_savestack_ix = tmp;
942e002e 276 return Nullch;
0f5d15d6
IZ
277}
278
279typedef struct re_cc_state
280{
281 I32 ss;
282 regnode *node;
283 struct re_cc_state *prev;
284 CURCUR *cc;
285 regexp *re;
286} re_cc_state;
287
02db2b7b 288#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 289
29d1e993
HS
290#define TRYPAREN(paren, n, input) { \
291 if (paren) { \
292 if (n) { \
293 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
294 PL_regendp[paren] = input - PL_bostr; \
295 } \
296 else \
297 PL_regendp[paren] = -1; \
298 } \
299 if (regmatch(next)) \
300 sayYES; \
301 if (paren && n) \
302 PL_regendp[paren] = -1; \
303}
304
305
a687059c 306/*
e50aee73 307 * pregexec and friends
a687059c
LW
308 */
309
310/*
c277df42 311 - pregexec - match a regexp against a string
a687059c 312 */
c277df42 313I32
864dbfa3 314Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 315 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
316/* strend: pointer to null at end of string */
317/* strbeg: real beginning of string */
318/* minend: end of match must be >=minend after stringarg. */
319/* nosave: For optimizations. */
320{
321 return
9041c2e3 322 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
323 nosave ? 0 : REXEC_COPY_STR);
324}
0f5d15d6
IZ
325
326STATIC void
cea2e8a9 327S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
328{
329 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
330#ifdef DEBUGGING
331 PL_regprogram = prog->program;
332#endif
333 PL_regnpar = prog->nparens;
9041c2e3
NIS
334 PL_regdata = prog->data;
335 PL_reg_re = prog;
0f5d15d6 336}
22e551b9 337
9041c2e3 338/*
cad2e5aa
JH
339 * Need to implement the following flags for reg_anch:
340 *
341 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
342 * USE_INTUIT_ML
343 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
344 * INTUIT_AUTORITATIVE_ML
345 * INTUIT_ONCE_NOML - Intuit can match in one location only.
346 * INTUIT_ONCE_ML
347 *
348 * Another flag for this function: SECOND_TIME (so that float substrs
349 * with giant delta may be not rechecked).
350 */
351
352/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
353
2c2d71f5 354/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
355 Otherwise, only SvCUR(sv) is used to get strbeg. */
356
357/* XXXX We assume that strpos is strbeg unless sv. */
358
6eb5f6b9
JH
359/* XXXX Some places assume that there is a fixed substring.
360 An update may be needed if optimizer marks as "INTUITable"
361 RExen without fixed substrings. Similarly, it is assumed that
362 lengths of all the strings are no more than minlen, thus they
363 cannot come from lookahead.
364 (Or minlen should take into account lookahead.) */
365
2c2d71f5
JH
366/* A failure to find a constant substring means that there is no need to make
367 an expensive call to REx engine, thus we celebrate a failure. Similarly,
368 finding a substring too deep into the string means that less calls to
30944b6d
IZ
369 regtry() should be needed.
370
371 REx compiler's optimizer found 4 possible hints:
372 a) Anchored substring;
373 b) Fixed substring;
374 c) Whether we are anchored (beginning-of-line or \G);
375 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 376 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
377 string which does not contradict any of them.
378 */
2c2d71f5 379
6eb5f6b9
JH
380/* Most of decisions we do here should have been done at compile time.
381 The nodes of the REx which we used for the search should have been
382 deleted from the finite automaton. */
383
cad2e5aa
JH
384char *
385Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
386 char *strend, U32 flags, re_scream_pos_data *data)
387{
b7953727 388 register I32 start_shift = 0;
cad2e5aa 389 /* Should be nonnegative! */
b7953727 390 register I32 end_shift = 0;
2c2d71f5
JH
391 register char *s;
392 register SV *check;
a1933d95 393 char *strbeg;
cad2e5aa 394 char *t;
33b8afdf 395 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 396 I32 ml_anch;
6eb5f6b9 397 register char *other_last = Nullch; /* other substr checked before this */
b7953727 398 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
399#ifdef DEBUGGING
400 char *i_strpos = strpos;
ce333219 401 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 402#endif
cad2e5aa 403
b8d68ded
JH
404 if (prog->reganch & ROPT_UTF8) {
405 DEBUG_r(PerlIO_printf(Perl_debug_log,
406 "UTF-8 regex...\n"));
407 PL_reg_flags |= RF_utf8;
408 }
409
2a782b5b 410 DEBUG_r({
b8d68ded 411 char *s = PL_reg_match_utf8 ?
c728cb41
JH
412 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
413 strpos;
b8d68ded
JH
414 int len = PL_reg_match_utf8 ?
415 strlen(s) : strend - strpos;
2a782b5b
JH
416 if (!PL_colorset)
417 reginitcolors();
b8d68ded
JH
418 if (PL_reg_match_utf8)
419 DEBUG_r(PerlIO_printf(Perl_debug_log,
420 "UTF-8 target...\n"));
2a782b5b
JH
421 PerlIO_printf(Perl_debug_log,
422 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
423 PL_colors[4],PL_colors[5],PL_colors[0],
424 prog->precomp,
425 PL_colors[1],
426 (strlen(prog->precomp) > 60 ? "..." : ""),
427 PL_colors[0],
428 (int)(len > 60 ? 60 : len),
429 s, PL_colors[1],
430 (len > 60 ? "..." : "")
431 );
432 });
cad2e5aa 433
c344f387
JH
434 /* CHR_DIST() would be more correct here but it makes things slow. */
435 if (prog->minlen > strend - strpos) {
a72c7584
JH
436 DEBUG_r(PerlIO_printf(Perl_debug_log,
437 "String too short... [re_intuit_start]\n"));
cad2e5aa 438 goto fail;
2c2d71f5 439 }
a1933d95 440 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 441 PL_regeol = strend;
33b8afdf
JH
442 if (do_utf8) {
443 if (!prog->check_utf8 && prog->check_substr)
444 to_utf8_substr(prog);
445 check = prog->check_utf8;
446 } else {
447 if (!prog->check_substr && prog->check_utf8)
448 to_byte_substr(prog);
449 check = prog->check_substr;
450 }
451 if (check == &PL_sv_undef) {
452 DEBUG_r(PerlIO_printf(Perl_debug_log,
453 "Non-utf string cannot match utf check string\n"));
454 goto fail;
455 }
2c2d71f5 456 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
457 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
458 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 459 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 460
7e25d62c
JH
461 if (!ml_anch) {
462 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
463 | ROPT_IMPLICIT)) /* not a real BOL */
464 /* SvCUR is not set on references: SvRV and SvPVX overlap */
465 && sv && !SvROK(sv)
466 && (strpos != strbeg)) {
467 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
468 goto fail;
469 }
470 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 471 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 472 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
473 I32 slen;
474
1aa99e6b 475 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
476 if (SvTAIL(check)) {
477 slen = SvCUR(check); /* >= 1 */
cad2e5aa 478
9041c2e3 479 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
480 || (strend - s == slen && strend[-1] != '\n')) {
481 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
482 goto fail_finish;
cad2e5aa
JH
483 }
484 /* Now should match s[0..slen-2] */
485 slen--;
653099ff 486 if (slen && (*SvPVX(check) != *s
cad2e5aa 487 || (slen > 1
653099ff 488 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
489 report_neq:
490 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
491 goto fail_finish;
492 }
cad2e5aa 493 }
653099ff
GS
494 else if (*SvPVX(check) != *s
495 || ((slen = SvCUR(check)) > 1
496 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
497 goto report_neq;
498 goto success_at_start;
7e25d62c 499 }
cad2e5aa 500 }
2c2d71f5 501 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 502 s = strpos;
2c2d71f5 503 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 504 end_shift = prog->minlen - start_shift -
653099ff 505 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 506 if (!ml_anch) {
653099ff
GS
507 I32 end = prog->check_offset_max + CHR_SVLEN(check)
508 - (SvTAIL(check) != 0);
1aa99e6b 509 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
510
511 if (end_shift < eshift)
512 end_shift = eshift;
513 }
cad2e5aa 514 }
2c2d71f5 515 else { /* Can match at random position */
cad2e5aa
JH
516 ml_anch = 0;
517 s = strpos;
2c2d71f5
JH
518 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
519 /* Should be nonnegative! */
520 end_shift = prog->minlen - start_shift -
653099ff 521 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
522 }
523
2c2d71f5 524#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 525 if (end_shift < 0)
6bbae5e6 526 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
527#endif
528
2c2d71f5
JH
529 restart:
530 /* Find a possible match in the region s..strend by looking for
531 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 532 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
533 I32 p = -1; /* Internal iterator of scream. */
534 I32 *pp = data ? data->scream_pos : &p;
535
2c2d71f5
JH
536 if (PL_screamfirst[BmRARE(check)] >= 0
537 || ( BmRARE(check) == '\n'
538 && (BmPREVIOUS(check) == SvCUR(check) - 1)
539 && SvTAIL(check) ))
9041c2e3 540 s = screaminstr(sv, check,
2c2d71f5 541 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 542 else
2c2d71f5 543 goto fail_finish;
cad2e5aa
JH
544 if (data)
545 *data->scream_olds = s;
546 }
f33976b4 547 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
548 s = fbm_instr((U8*)(s + start_shift),
549 (U8*)(strend - end_shift),
550 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 551 else
1aa99e6b
IH
552 s = fbm_instr(HOP3(s, start_shift, strend),
553 HOP3(strend, -end_shift, strbeg),
2c2d71f5 554 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
555
556 /* Update the count-of-usability, remove useless subpatterns,
557 unshift s. */
2c2d71f5
JH
558
559 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
560 (s ? "Found" : "Did not find"),
33b8afdf 561 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 562 PL_colors[0],
7b0972df
JH
563 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
564 SvPVX(check),
2c2d71f5
JH
565 PL_colors[1], (SvTAIL(check) ? "$" : ""),
566 (s ? " at offset " : "...\n") ) );
567
568 if (!s)
569 goto fail_finish;
570
6eb5f6b9
JH
571 check_at = s;
572
2c2d71f5 573 /* Finish the diagnostic message */
30944b6d 574 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
575
576 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
577 Start with the other substr.
578 XXXX no SCREAM optimization yet - and a very coarse implementation
579 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
580 *always* match. Probably should be marked during compile...
581 Probably it is right to do no SCREAM here...
582 */
583
33b8afdf 584 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 585 /* Take into account the "other" substring. */
2c2d71f5
JH
586 /* XXXX May be hopelessly wrong for UTF... */
587 if (!other_last)
6eb5f6b9 588 other_last = strpos;
33b8afdf 589 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
590 do_other_anchored:
591 {
1aa99e6b 592 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 593 char *s1 = s;
33b8afdf 594 SV* must;
2c2d71f5 595
2c2d71f5
JH
596 t = s - prog->check_offset_max;
597 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 598 && (!do_utf8
1aa99e6b 599 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 600 && t > strpos)))
30944b6d 601 /* EMPTY */;
2c2d71f5
JH
602 else
603 t = strpos;
1aa99e6b 604 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
605 if (t < other_last) /* These positions already checked */
606 t = other_last;
1aa99e6b 607 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
608 if (last < last1)
609 last1 = last;
610 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
611 /* On end-of-str: see comment below. */
33b8afdf
JH
612 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
613 if (must == &PL_sv_undef) {
614 s = (char*)NULL;
615 DEBUG_r(must = prog->anchored_utf8); /* for debug */
616 }
617 else
618 s = fbm_instr(
619 (unsigned char*)t,
620 HOP3(HOP3(last1, prog->anchored_offset, strend)
621 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
622 must,
623 PL_multiline ? FBMrf_MULTILINE : 0
624 );
1aa99e6b
IH
625 DEBUG_r(PerlIO_printf(Perl_debug_log,
626 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
627 (s ? "Found" : "Contradicts"),
628 PL_colors[0],
33b8afdf
JH
629 (int)(SvCUR(must)
630 - (SvTAIL(must)!=0)),
631 SvPVX(must),
632 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
633 if (!s) {
634 if (last1 >= last2) {
635 DEBUG_r(PerlIO_printf(Perl_debug_log,
636 ", giving up...\n"));
637 goto fail_finish;
638 }
639 DEBUG_r(PerlIO_printf(Perl_debug_log,
640 ", trying floating at offset %ld...\n",
1aa99e6b
IH
641 (long)(HOP3c(s1, 1, strend) - i_strpos)));
642 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
643 s = HOP3c(last, 1, strend);
2c2d71f5
JH
644 goto restart;
645 }
646 else {
647 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 648 (long)(s - i_strpos)));
1aa99e6b
IH
649 t = HOP3c(s, -prog->anchored_offset, strbeg);
650 other_last = HOP3c(s, 1, strend);
30944b6d 651 s = s1;
2c2d71f5
JH
652 if (t == strpos)
653 goto try_at_start;
2c2d71f5
JH
654 goto try_at_offset;
655 }
30944b6d 656 }
2c2d71f5
JH
657 }
658 else { /* Take into account the floating substring. */
33b8afdf
JH
659 char *last, *last1;
660 char *s1 = s;
661 SV* must;
662
663 t = HOP3c(s, -start_shift, strbeg);
664 last1 = last =
665 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
666 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
667 last = HOP3c(t, prog->float_max_offset, strend);
668 s = HOP3c(t, prog->float_min_offset, strend);
669 if (s < other_last)
670 s = other_last;
2c2d71f5 671 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
672 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
673 /* fbm_instr() takes into account exact value of end-of-str
674 if the check is SvTAIL(ed). Since false positives are OK,
675 and end-of-str is not later than strend we are OK. */
676 if (must == &PL_sv_undef) {
677 s = (char*)NULL;
678 DEBUG_r(must = prog->float_utf8); /* for debug message */
679 }
680 else
2c2d71f5 681 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
682 (unsigned char*)last + SvCUR(must)
683 - (SvTAIL(must)!=0),
684 must, PL_multiline ? FBMrf_MULTILINE : 0);
685 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
686 (s ? "Found" : "Contradicts"),
687 PL_colors[0],
688 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
689 SvPVX(must),
690 PL_colors[1], (SvTAIL(must) ? "$" : "")));
691 if (!s) {
692 if (last1 == last) {
2c2d71f5 693 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
694 ", giving up...\n"));
695 goto fail_finish;
2c2d71f5 696 }
33b8afdf
JH
697 DEBUG_r(PerlIO_printf(Perl_debug_log,
698 ", trying anchored starting at offset %ld...\n",
699 (long)(s1 + 1 - i_strpos)));
700 other_last = last;
701 s = HOP3c(t, 1, strend);
702 goto restart;
703 }
704 else {
705 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
706 (long)(s - i_strpos)));
707 other_last = s; /* Fix this later. --Hugo */
708 s = s1;
709 if (t == strpos)
710 goto try_at_start;
711 goto try_at_offset;
712 }
2c2d71f5 713 }
cad2e5aa 714 }
2c2d71f5
JH
715
716 t = s - prog->check_offset_max;
2c2d71f5 717 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 718 && (!do_utf8
1aa99e6b
IH
719 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
720 && t > strpos))) {
2c2d71f5
JH
721 /* Fixed substring is found far enough so that the match
722 cannot start at strpos. */
723 try_at_offset:
cad2e5aa 724 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
725 /* Eventually fbm_*() should handle this, but often
726 anchored_offset is not 0, so this check will not be wasted. */
727 /* XXXX In the code below we prefer to look for "^" even in
728 presence of anchored substrings. And we search even
729 beyond the found float position. These pessimizations
730 are historical artefacts only. */
731 find_anchor:
2c2d71f5 732 while (t < strend - prog->minlen) {
cad2e5aa 733 if (*t == '\n') {
4ee3650e 734 if (t < check_at - prog->check_offset_min) {
33b8afdf 735 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
736 /* Since we moved from the found position,
737 we definitely contradict the found anchored
30944b6d
IZ
738 substr. Due to the above check we do not
739 contradict "check" substr.
740 Thus we can arrive here only if check substr
741 is float. Redo checking for "other"=="fixed".
742 */
9041c2e3 743 strpos = t + 1;
30944b6d
IZ
744 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
745 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
746 goto do_other_anchored;
747 }
4ee3650e
GS
748 /* We don't contradict the found floating substring. */
749 /* XXXX Why not check for STCLASS? */
cad2e5aa 750 s = t + 1;
2c2d71f5 751 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 752 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
753 goto set_useful;
754 }
4ee3650e
GS
755 /* Position contradicts check-string */
756 /* XXXX probably better to look for check-string
757 than for "\n", so one should lower the limit for t? */
758 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 759 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 760 other_last = strpos = s = t + 1;
cad2e5aa
JH
761 goto restart;
762 }
763 t++;
764 }
2c2d71f5
JH
765 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
766 PL_colors[0],PL_colors[1]));
767 goto fail_finish;
cad2e5aa 768 }
f5952150
GS
769 else {
770 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
771 PL_colors[0],PL_colors[1]));
772 }
cad2e5aa
JH
773 s = t;
774 set_useful:
33b8afdf 775 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
776 }
777 else {
f5952150 778 /* The found string does not prohibit matching at strpos,
2c2d71f5 779 - no optimization of calling REx engine can be performed,
f5952150
GS
780 unless it was an MBOL and we are not after MBOL,
781 or a future STCLASS check will fail this. */
2c2d71f5
JH
782 try_at_start:
783 /* Even in this situation we may use MBOL flag if strpos is offset
784 wrt the start of the string. */
05b4157f 785 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 786 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
787 /* May be due to an implicit anchor of m{.*foo} */
788 && !(prog->reganch & ROPT_IMPLICIT))
789 {
cad2e5aa
JH
790 t = strpos;
791 goto find_anchor;
792 }
30944b6d 793 DEBUG_r( if (ml_anch)
f5952150
GS
794 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
795 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 796 );
2c2d71f5 797 success_at_start:
30944b6d 798 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
799 && (do_utf8 ? (
800 prog->check_utf8 /* Could be deleted already */
801 && --BmUSEFUL(prog->check_utf8) < 0
802 && (prog->check_utf8 == prog->float_utf8)
803 ) : (
804 prog->check_substr /* Could be deleted already */
805 && --BmUSEFUL(prog->check_substr) < 0
806 && (prog->check_substr == prog->float_substr)
807 )))
66e933ab 808 {
cad2e5aa 809 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 810 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
811 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
812 if (do_utf8 ? prog->check_substr : prog->check_utf8)
813 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
814 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
815 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 816 check = Nullsv; /* abort */
cad2e5aa 817 s = strpos;
3cf5c195
IZ
818 /* XXXX This is a remnant of the old implementation. It
819 looks wasteful, since now INTUIT can use many
6eb5f6b9 820 other heuristics. */
cad2e5aa
JH
821 prog->reganch &= ~RE_USE_INTUIT;
822 }
823 else
824 s = strpos;
825 }
826
6eb5f6b9
JH
827 /* Last resort... */
828 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
829 if (prog->regstclass) {
830 /* minlen == 0 is possible if regstclass is \b or \B,
831 and the fixed substr is ''$.
832 Since minlen is already taken into account, s+1 is before strend;
833 accidentally, minlen >= 1 guaranties no false positives at s + 1
834 even for \b or \B. But (minlen? 1 : 0) below assumes that
835 regstclass does not come from lookahead... */
836 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
837 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 838 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 839 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 840 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 841 : 1);
33b8afdf 842 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 843 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 844 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
845 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
846 cl_l, strend)
847 : strend);
a1933d95 848 char *startpos = strbeg;
6eb5f6b9
JH
849
850 t = s;
76384e4a 851 if (prog->reganch & ROPT_UTF8) {
ffc61ed2 852 PL_regdata = prog->data;
76384e4a
GS
853 PL_bostr = startpos;
854 }
f33976b4 855 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
856 if (!s) {
857#ifdef DEBUGGING
b7953727 858 char *what = 0;
6eb5f6b9
JH
859#endif
860 if (endpos == strend) {
861 DEBUG_r( PerlIO_printf(Perl_debug_log,
862 "Could not match STCLASS...\n") );
863 goto fail;
864 }
66e933ab
GS
865 DEBUG_r( PerlIO_printf(Perl_debug_log,
866 "This position contradicts STCLASS...\n") );
653099ff
GS
867 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
868 goto fail;
6eb5f6b9 869 /* Contradict one of substrings */
33b8afdf
JH
870 if (prog->anchored_substr || prog->anchored_utf8) {
871 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
872 DEBUG_r( what = "anchored" );
873 hop_and_restart:
1aa99e6b 874 s = HOP3c(t, 1, strend);
66e933ab
GS
875 if (s + start_shift + end_shift > strend) {
876 /* XXXX Should be taken into account earlier? */
877 DEBUG_r( PerlIO_printf(Perl_debug_log,
878 "Could not match STCLASS...\n") );
879 goto fail;
880 }
5e39e1e5
HS
881 if (!check)
882 goto giveup;
6eb5f6b9 883 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 884 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
885 what, (long)(s + start_shift - i_strpos)) );
886 goto restart;
887 }
66e933ab 888 /* Have both, check_string is floating */
6eb5f6b9
JH
889 if (t + start_shift >= check_at) /* Contradicts floating=check */
890 goto retry_floating_check;
891 /* Recheck anchored substring, but not floating... */
9041c2e3 892 s = check_at;
5e39e1e5
HS
893 if (!check)
894 goto giveup;
6eb5f6b9 895 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 896 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
897 (long)(other_last - i_strpos)) );
898 goto do_other_anchored;
899 }
60e71179
GS
900 /* Another way we could have checked stclass at the
901 current position only: */
902 if (ml_anch) {
903 s = t = t + 1;
5e39e1e5
HS
904 if (!check)
905 goto giveup;
60e71179 906 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
907 "Looking for /%s^%s/m starting at offset %ld...\n",
908 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 909 goto try_at_offset;
66e933ab 910 }
33b8afdf 911 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 912 goto fail;
6eb5f6b9
JH
913 /* Check is floating subtring. */
914 retry_floating_check:
915 t = check_at - start_shift;
916 DEBUG_r( what = "floating" );
917 goto hop_and_restart;
918 }
b7953727
JH
919 if (t != s) {
920 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 921 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
922 (long)(t - i_strpos), (long)(s - i_strpos))
923 );
924 }
925 else {
926 DEBUG_r(PerlIO_printf(Perl_debug_log,
927 "Does not contradict STCLASS...\n");
928 );
929 }
6eb5f6b9 930 }
5e39e1e5
HS
931 giveup:
932 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
933 PL_colors[4], (check ? "Guessed" : "Giving up"),
934 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 935 return s;
2c2d71f5
JH
936
937 fail_finish: /* Substring not found */
33b8afdf
JH
938 if (prog->check_substr || prog->check_utf8) /* could be removed already */
939 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 940 fail:
2c2d71f5 941 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
942 PL_colors[4],PL_colors[5]));
943 return Nullch;
944}
9661b544 945
6eb5f6b9 946/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
947STATIC char *
948S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 949{
6eb5f6b9
JH
950 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
951 char *m;
d8093b23
G
952 STRLEN ln;
953 unsigned int c1;
954 unsigned int c2;
6eb5f6b9
JH
955 char *e;
956 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 957 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 958
6eb5f6b9
JH
959 /* We know what class it must start with. */
960 switch (OP(c)) {
6eb5f6b9 961 case ANYOF:
a0ed51b3 962 while (s < strend) {
7d3e948e
JH
963 STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
964
965 if (do_utf8 ?
ba7b4546 966 reginclass(c, (U8*)s, 0, do_utf8) :
7d3e948e 967 REGINCLASS(c, (U8*)s) ||
ebc501f0
JH
968 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
969 /* The assignment of 2 is intentional:
970 * for the sharp s, the skip is 2. */
971 (skip = SHARP_S_SKIP)
972 )) {
6eb5f6b9
JH
973 if (tmp && (norun || regtry(prog, s)))
974 goto got_it;
975 else
976 tmp = doevery;
a0ed51b3 977 }
e0f9d4a8 978 else
7d3e948e 979 tmp = 1;
e0f9d4a8 980 s += skip;
a0d0e21e 981 }
6eb5f6b9 982 break;
f33976b4
DB
983 case CANY:
984 while (s < strend) {
985 if (tmp && (norun || regtry(prog, s)))
986 goto got_it;
987 else
988 tmp = doevery;
989 s++;
990 }
991 break;
6eb5f6b9
JH
992 case EXACTF:
993 m = STRING(c);
994 ln = STR_LEN(c);
1aa99e6b 995 if (UTF) {
a2a2844f 996 STRLEN ulen1, ulen2;
e7ae6809
JH
997 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
998 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
999
1000 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1001 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1002
872c91ae
JH
1003 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1004 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1005 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1006 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1007 }
1008 else {
1009 c1 = *(U8*)m;
1010 c2 = PL_fold[c1];
1011 }
6eb5f6b9
JH
1012 goto do_exactf;
1013 case EXACTFL:
1014 m = STRING(c);
1015 ln = STR_LEN(c);
d8093b23 1016 c1 = *(U8*)m;
6eb5f6b9
JH
1017 c2 = PL_fold_locale[c1];
1018 do_exactf:
6795a346 1019 e = HOP3c(strend, -(I32)ln, s);
b3c9acc1 1020
6eb5f6b9
JH
1021 if (norun && e < s)
1022 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1023
60a8b682
JH
1024 /* The idea in the EXACTF* cases is to first find the
1025 * first character of the EXACTF* node and then, if
1026 * necessary, case-insensitively compare the full
1027 * text of the node. The c1 and c2 are the first
1028 * characters (though in Unicode it gets a bit
1029 * more complicated because there are more cases
7f16dd3d
JH
1030 * than just upper and lower: one needs to use
1031 * the so-called folding case for case-insensitive
1032 * matching (called "loose matching" in Unicode).
1033 * ibcmp_utf8() will do just that. */
60a8b682 1034
1aa99e6b 1035 if (do_utf8) {
575cac57
JH
1036 UV c, f;
1037 U8 tmpbuf [UTF8_MAXLEN+1];
1038 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1039 STRLEN len, foldlen;
d7f013c8 1040
09091399 1041 if (c1 == c2) {
1aa99e6b 1042 while (s <= e) {
872c91ae
JH
1043 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1044 ckWARN(WARN_UTF8) ?
1045 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1046 if ( c == c1
1047 && (ln == len ||
66423254 1048 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1049 m, (char **)0, ln, (bool)UTF))
55da9344 1050 && (norun || regtry(prog, s)) )
1aa99e6b 1051 goto got_it;
80aecb99
JH
1052 else {
1053 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1054 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1055 if ( f != c
1056 && (f == c1 || f == c2)
1057 && (ln == foldlen ||
66423254
JH
1058 !ibcmp_utf8((char *) foldbuf,
1059 (char **)0, foldlen, do_utf8,
d07ddd77 1060 m,
eb160463 1061 (char **)0, ln, (bool)UTF))
80aecb99
JH
1062 && (norun || regtry(prog, s)) )
1063 goto got_it;
1064 }
1aa99e6b
IH
1065 s += len;
1066 }
09091399
JH
1067 }
1068 else {
1aa99e6b 1069 while (s <= e) {
872c91ae
JH
1070 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1071 ckWARN(WARN_UTF8) ?
1072 0 : UTF8_ALLOW_ANY);
80aecb99 1073
60a8b682 1074 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1075 * Note that not all the possible combinations
1076 * are handled here: some of them are handled
1077 * by the standard folding rules, and some of
1078 * them (the character class or ANYOF cases)
1079 * are handled during compiletime in
1080 * regexec.c:S_regclass(). */
880bd946
JH
1081 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1082 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1083 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1084
1085 if ( (c == c1 || c == c2)
1086 && (ln == len ||
66423254 1087 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1088 m, (char **)0, ln, (bool)UTF))
55da9344 1089 && (norun || regtry(prog, s)) )
1aa99e6b 1090 goto got_it;
80aecb99
JH
1091 else {
1092 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1093 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1094 if ( f != c
1095 && (f == c1 || f == c2)
1096 && (ln == foldlen ||
a6872d42 1097 !ibcmp_utf8((char *) foldbuf,
66423254 1098 (char **)0, foldlen, do_utf8,
d07ddd77 1099 m,
eb160463 1100 (char **)0, ln, (bool)UTF))
80aecb99
JH
1101 && (norun || regtry(prog, s)) )
1102 goto got_it;
1103 }
1aa99e6b
IH
1104 s += len;
1105 }
09091399 1106 }
1aa99e6b
IH
1107 }
1108 else {
1109 if (c1 == c2)
1110 while (s <= e) {
1111 if ( *(U8*)s == c1
1112 && (ln == 1 || !(OP(c) == EXACTF
1113 ? ibcmp(s, m, ln)
1114 : ibcmp_locale(s, m, ln)))
1115 && (norun || regtry(prog, s)) )
1116 goto got_it;
1117 s++;
1118 }
1119 else
1120 while (s <= e) {
1121 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1122 && (ln == 1 || !(OP(c) == EXACTF
1123 ? ibcmp(s, m, ln)
1124 : ibcmp_locale(s, m, ln)))
1125 && (norun || regtry(prog, s)) )
1126 goto got_it;
1127 s++;
1128 }
b3c9acc1
IZ
1129 }
1130 break;
bbce6d69 1131 case BOUNDL:
3280af22 1132 PL_reg_flags |= RF_tainted;
bbce6d69 1133 /* FALL THROUGH */
a0d0e21e 1134 case BOUND:
ffc61ed2 1135 if (do_utf8) {
12d33761 1136 if (s == PL_bostr)
ffc61ed2
JH
1137 tmp = '\n';
1138 else {
b4f7163a 1139 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1140
b4f7163a 1141 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1142 }
1143 tmp = ((OP(c) == BOUND ?
9041c2e3 1144 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1145 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1146 while (s < strend) {
1147 if (tmp == !(OP(c) == BOUND ?
3568d838 1148 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1149 isALNUM_LC_utf8((U8*)s)))
1150 {
1151 tmp = !tmp;
1152 if ((norun || regtry(prog, s)))
1153 goto got_it;
1154 }
1155 s += UTF8SKIP(s);
a687059c 1156 }
a0d0e21e 1157 }
667bb95a 1158 else {
12d33761 1159 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1160 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1161 while (s < strend) {
1162 if (tmp ==
1163 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1164 tmp = !tmp;
1165 if ((norun || regtry(prog, s)))
1166 goto got_it;
1167 }
1168 s++;
a0ed51b3 1169 }
a0ed51b3 1170 }
6eb5f6b9 1171 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1172 goto got_it;
1173 break;
bbce6d69 1174 case NBOUNDL:
3280af22 1175 PL_reg_flags |= RF_tainted;
bbce6d69 1176 /* FALL THROUGH */
a0d0e21e 1177 case NBOUND:
ffc61ed2 1178 if (do_utf8) {
12d33761 1179 if (s == PL_bostr)
ffc61ed2
JH
1180 tmp = '\n';
1181 else {
b4f7163a 1182 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1183
b4f7163a 1184 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1185 }
1186 tmp = ((OP(c) == NBOUND ?
9041c2e3 1187 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1188 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1189 while (s < strend) {
1190 if (tmp == !(OP(c) == NBOUND ?
3568d838 1191 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1192 isALNUM_LC_utf8((U8*)s)))
1193 tmp = !tmp;
1194 else if ((norun || regtry(prog, s)))
1195 goto got_it;
1196 s += UTF8SKIP(s);
1197 }
a0d0e21e 1198 }
667bb95a 1199 else {
12d33761 1200 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1201 tmp = ((OP(c) == NBOUND ?
1202 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1203 while (s < strend) {
1204 if (tmp ==
1205 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1206 tmp = !tmp;
1207 else if ((norun || regtry(prog, s)))
1208 goto got_it;
1209 s++;
1210 }
a0ed51b3 1211 }
6eb5f6b9 1212 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1213 goto got_it;
1214 break;
a0d0e21e 1215 case ALNUM:
ffc61ed2 1216 if (do_utf8) {
8269fa76 1217 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1218 while (s < strend) {
3568d838 1219 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1220 if (tmp && (norun || regtry(prog, s)))
1221 goto got_it;
1222 else
1223 tmp = doevery;
1224 }
bbce6d69 1225 else
ffc61ed2
JH
1226 tmp = 1;
1227 s += UTF8SKIP(s);
bbce6d69 1228 }
bbce6d69 1229 }
ffc61ed2
JH
1230 else {
1231 while (s < strend) {
1232 if (isALNUM(*s)) {
1233 if (tmp && (norun || regtry(prog, s)))
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
a0ed51b3 1238 else
ffc61ed2
JH
1239 tmp = 1;
1240 s++;
a0ed51b3 1241 }
a0ed51b3
LW
1242 }
1243 break;
bbce6d69 1244 case ALNUML:
3280af22 1245 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1246 if (do_utf8) {
1247 while (s < strend) {
1248 if (isALNUM_LC_utf8((U8*)s)) {
1249 if (tmp && (norun || regtry(prog, s)))
1250 goto got_it;
1251 else
1252 tmp = doevery;
1253 }
a687059c 1254 else
ffc61ed2
JH
1255 tmp = 1;
1256 s += UTF8SKIP(s);
a0d0e21e 1257 }
a0d0e21e 1258 }
ffc61ed2
JH
1259 else {
1260 while (s < strend) {
1261 if (isALNUM_LC(*s)) {
1262 if (tmp && (norun || regtry(prog, s)))
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a0ed51b3 1267 else
ffc61ed2
JH
1268 tmp = 1;
1269 s++;
a0ed51b3 1270 }
a0ed51b3
LW
1271 }
1272 break;
a0d0e21e 1273 case NALNUM:
ffc61ed2 1274 if (do_utf8) {
8269fa76 1275 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1276 while (s < strend) {
3568d838 1277 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1278 if (tmp && (norun || regtry(prog, s)))
1279 goto got_it;
1280 else
1281 tmp = doevery;
1282 }
bbce6d69 1283 else
ffc61ed2
JH
1284 tmp = 1;
1285 s += UTF8SKIP(s);
bbce6d69 1286 }
bbce6d69 1287 }
ffc61ed2
JH
1288 else {
1289 while (s < strend) {
1290 if (!isALNUM(*s)) {
1291 if (tmp && (norun || regtry(prog, s)))
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
a0ed51b3 1296 else
ffc61ed2
JH
1297 tmp = 1;
1298 s++;
a0ed51b3 1299 }
a0ed51b3
LW
1300 }
1301 break;
bbce6d69 1302 case NALNUML:
3280af22 1303 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1304 if (do_utf8) {
1305 while (s < strend) {
1306 if (!isALNUM_LC_utf8((U8*)s)) {
1307 if (tmp && (norun || regtry(prog, s)))
1308 goto got_it;
1309 else
1310 tmp = doevery;
1311 }
a687059c 1312 else
ffc61ed2
JH
1313 tmp = 1;
1314 s += UTF8SKIP(s);
a687059c 1315 }
a0d0e21e 1316 }
ffc61ed2
JH
1317 else {
1318 while (s < strend) {
1319 if (!isALNUM_LC(*s)) {
1320 if (tmp && (norun || regtry(prog, s)))
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a0ed51b3 1325 else
ffc61ed2
JH
1326 tmp = 1;
1327 s++;
a0ed51b3 1328 }
a0ed51b3
LW
1329 }
1330 break;
a0d0e21e 1331 case SPACE:
ffc61ed2 1332 if (do_utf8) {
8269fa76 1333 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1334 while (s < strend) {
3568d838 1335 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1336 if (tmp && (norun || regtry(prog, s)))
1337 goto got_it;
1338 else
1339 tmp = doevery;
1340 }
a0d0e21e 1341 else
ffc61ed2
JH
1342 tmp = 1;
1343 s += UTF8SKIP(s);
2304df62 1344 }
a0d0e21e 1345 }
ffc61ed2
JH
1346 else {
1347 while (s < strend) {
1348 if (isSPACE(*s)) {
1349 if (tmp && (norun || regtry(prog, s)))
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0ed51b3 1354 else
ffc61ed2
JH
1355 tmp = 1;
1356 s++;
a0ed51b3 1357 }
a0ed51b3
LW
1358 }
1359 break;
bbce6d69 1360 case SPACEL:
3280af22 1361 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1362 if (do_utf8) {
1363 while (s < strend) {
1364 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1365 if (tmp && (norun || regtry(prog, s)))
1366 goto got_it;
1367 else
1368 tmp = doevery;
1369 }
bbce6d69 1370 else
ffc61ed2
JH
1371 tmp = 1;
1372 s += UTF8SKIP(s);
bbce6d69 1373 }
bbce6d69 1374 }
ffc61ed2
JH
1375 else {
1376 while (s < strend) {
1377 if (isSPACE_LC(*s)) {
1378 if (tmp && (norun || regtry(prog, s)))
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
a0ed51b3 1383 else
ffc61ed2
JH
1384 tmp = 1;
1385 s++;
a0ed51b3 1386 }
a0ed51b3
LW
1387 }
1388 break;
a0d0e21e 1389 case NSPACE:
ffc61ed2 1390 if (do_utf8) {
8269fa76 1391 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1392 while (s < strend) {
3568d838 1393 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1394 if (tmp && (norun || regtry(prog, s)))
1395 goto got_it;
1396 else
1397 tmp = doevery;
1398 }
a0d0e21e 1399 else
ffc61ed2
JH
1400 tmp = 1;
1401 s += UTF8SKIP(s);
a687059c 1402 }
a0d0e21e 1403 }
ffc61ed2
JH
1404 else {
1405 while (s < strend) {
1406 if (!isSPACE(*s)) {
1407 if (tmp && (norun || regtry(prog, s)))
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0ed51b3 1412 else
ffc61ed2
JH
1413 tmp = 1;
1414 s++;
a0ed51b3 1415 }
a0ed51b3
LW
1416 }
1417 break;
bbce6d69 1418 case NSPACEL:
3280af22 1419 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1420 if (do_utf8) {
1421 while (s < strend) {
1422 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1423 if (tmp && (norun || regtry(prog, s)))
1424 goto got_it;
1425 else
1426 tmp = doevery;
1427 }
bbce6d69 1428 else
ffc61ed2
JH
1429 tmp = 1;
1430 s += UTF8SKIP(s);
bbce6d69 1431 }
bbce6d69 1432 }
ffc61ed2
JH
1433 else {
1434 while (s < strend) {
1435 if (!isSPACE_LC(*s)) {
1436 if (tmp && (norun || regtry(prog, s)))
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
a0ed51b3 1441 else
ffc61ed2
JH
1442 tmp = 1;
1443 s++;
a0ed51b3 1444 }
a0ed51b3
LW
1445 }
1446 break;
a0d0e21e 1447 case DIGIT:
ffc61ed2 1448 if (do_utf8) {
8269fa76 1449 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1450 while (s < strend) {
3568d838 1451 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1452 if (tmp && (norun || regtry(prog, s)))
1453 goto got_it;
1454 else
1455 tmp = doevery;
1456 }
a0d0e21e 1457 else
ffc61ed2
JH
1458 tmp = 1;
1459 s += UTF8SKIP(s);
2b69d0c2 1460 }
a0d0e21e 1461 }
ffc61ed2
JH
1462 else {
1463 while (s < strend) {
1464 if (isDIGIT(*s)) {
1465 if (tmp && (norun || regtry(prog, s)))
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0ed51b3 1470 else
ffc61ed2
JH
1471 tmp = 1;
1472 s++;
a0ed51b3 1473 }
a0ed51b3
LW
1474 }
1475 break;
b8c5462f
JH
1476 case DIGITL:
1477 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1478 if (do_utf8) {
1479 while (s < strend) {
1480 if (isDIGIT_LC_utf8((U8*)s)) {
1481 if (tmp && (norun || regtry(prog, s)))
1482 goto got_it;
1483 else
1484 tmp = doevery;
1485 }
b8c5462f 1486 else
ffc61ed2
JH
1487 tmp = 1;
1488 s += UTF8SKIP(s);
b8c5462f 1489 }
b8c5462f 1490 }
ffc61ed2
JH
1491 else {
1492 while (s < strend) {
1493 if (isDIGIT_LC(*s)) {
1494 if (tmp && (norun || regtry(prog, s)))
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2
JH
1500 tmp = 1;
1501 s++;
b8c5462f 1502 }
b8c5462f
JH
1503 }
1504 break;
a0d0e21e 1505 case NDIGIT:
ffc61ed2 1506 if (do_utf8) {
8269fa76 1507 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1508 while (s < strend) {
3568d838 1509 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1510 if (tmp && (norun || regtry(prog, s)))
1511 goto got_it;
1512 else
1513 tmp = doevery;
1514 }
a0d0e21e 1515 else
ffc61ed2
JH
1516 tmp = 1;
1517 s += UTF8SKIP(s);
a687059c 1518 }
a0d0e21e 1519 }
ffc61ed2
JH
1520 else {
1521 while (s < strend) {
1522 if (!isDIGIT(*s)) {
1523 if (tmp && (norun || regtry(prog, s)))
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0ed51b3 1528 else
ffc61ed2
JH
1529 tmp = 1;
1530 s++;
a0ed51b3 1531 }
a0ed51b3
LW
1532 }
1533 break;
b8c5462f
JH
1534 case NDIGITL:
1535 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1536 if (do_utf8) {
1537 while (s < strend) {
1538 if (!isDIGIT_LC_utf8((U8*)s)) {
1539 if (tmp && (norun || regtry(prog, s)))
1540 goto got_it;
1541 else
1542 tmp = doevery;
1543 }
b8c5462f 1544 else
ffc61ed2
JH
1545 tmp = 1;
1546 s += UTF8SKIP(s);
b8c5462f 1547 }
a0ed51b3 1548 }
ffc61ed2
JH
1549 else {
1550 while (s < strend) {
1551 if (!isDIGIT_LC(*s)) {
1552 if (tmp && (norun || regtry(prog, s)))
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
cf93c79d 1557 else
ffc61ed2
JH
1558 tmp = 1;
1559 s++;
b8c5462f 1560 }
b8c5462f
JH
1561 }
1562 break;
b3c9acc1 1563 default:
3c3eec57
GS
1564 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1565 break;
d6a28714 1566 }
6eb5f6b9
JH
1567 return 0;
1568 got_it:
1569 return s;
1570}
1571
1572/*
1573 - regexec_flags - match a regexp against a string
1574 */
1575I32
1576Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1577 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1578/* strend: pointer to null at end of string */
1579/* strbeg: real beginning of string */
1580/* minend: end of match must be >=minend after stringarg. */
1581/* data: May be used for some additional optimizations. */
1582/* nosave: For optimizations. */
1583{
6eb5f6b9
JH
1584 register char *s;
1585 register regnode *c;
1586 register char *startpos = stringarg;
6eb5f6b9
JH
1587 I32 minlen; /* must match at least this many chars */
1588 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1589 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1590 constant substr. */ /* CC */
1591 I32 end_shift = 0; /* Same for the end. */ /* CC */
1592 I32 scream_pos = -1; /* Internal iterator of scream. */
1593 char *scream_olds;
1594 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1595 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1596#ifdef DEBUGGING
9e55ce06
JH
1597 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1598 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1599#endif
6eb5f6b9
JH
1600
1601 PL_regcc = 0;
1602
1603 cache_re(prog);
1604#ifdef DEBUGGING
aea4f609 1605 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1606#endif
1607
1608 /* Be paranoid... */
1609 if (prog == NULL || startpos == NULL) {
1610 Perl_croak(aTHX_ "NULL regexp parameter");
1611 return 0;
1612 }
1613
1614 minlen = prog->minlen;
61a36c01 1615 if (strend - startpos < minlen) {
a72c7584
JH
1616 DEBUG_r(PerlIO_printf(Perl_debug_log,
1617 "String too short [regexec_flags]...\n"));
1618 goto phooey;
1aa99e6b 1619 }
6eb5f6b9 1620
6eb5f6b9
JH
1621 /* Check validity of program. */
1622 if (UCHARAT(prog->program) != REG_MAGIC) {
1623 Perl_croak(aTHX_ "corrupted regexp program");
1624 }
1625
1626 PL_reg_flags = 0;
1627 PL_reg_eval_set = 0;
1628 PL_reg_maxiter = 0;
1629
1630 if (prog->reganch & ROPT_UTF8)
1631 PL_reg_flags |= RF_utf8;
1632
1633 /* Mark beginning of line for ^ and lookbehind. */
1634 PL_regbol = startpos;
1635 PL_bostr = strbeg;
1636 PL_reg_sv = sv;
1637
1638 /* Mark end of line for $ (and such) */
1639 PL_regeol = strend;
1640
1641 /* see how far we have to get to not match where we matched before */
1642 PL_regtill = startpos+minend;
1643
1644 /* We start without call_cc context. */
1645 PL_reg_call_cc = 0;
1646
1647 /* If there is a "must appear" string, look for it. */
1648 s = startpos;
1649
1650 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1651 MAGIC *mg;
1652
1653 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1654 PL_reg_ganch = startpos;
1655 else if (sv && SvTYPE(sv) >= SVt_PVMG
1656 && SvMAGIC(sv)
14befaf4
DM
1657 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1658 && mg->mg_len >= 0) {
6eb5f6b9
JH
1659 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1660 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1661 if (s > PL_reg_ganch)
6eb5f6b9
JH
1662 goto phooey;
1663 s = PL_reg_ganch;
1664 }
1665 }
1666 else /* pos() not defined */
1667 PL_reg_ganch = strbeg;
1668 }
1669
33b8afdf 1670 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1671 re_scream_pos_data d;
1672
1673 d.scream_olds = &scream_olds;
1674 d.scream_pos = &scream_pos;
1675 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1676 if (!s) {
1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1678 goto phooey; /* not present */
3fa9c3d7 1679 }
6eb5f6b9
JH
1680 }
1681
2a782b5b 1682 DEBUG_r({
9e55ce06
JH
1683 char *s0 = UTF ?
1684 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1685 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1686 prog->precomp;
1687 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1688 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1689 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1690 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1691 if (!PL_colorset)
1692 reginitcolors();
1693 PerlIO_printf(Perl_debug_log,
9e55ce06 1694 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1695 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1696 len0, len0, s0,
2a782b5b 1697 PL_colors[1],
9e55ce06 1698 len0 > 60 ? "..." : "",
2a782b5b 1699 PL_colors[0],
9e55ce06
JH
1700 (int)(len1 > 60 ? 60 : len1),
1701 s1, PL_colors[1],
1702 (len1 > 60 ? "..." : "")
2a782b5b
JH
1703 );
1704 });
6eb5f6b9
JH
1705
1706 /* Simplest case: anchored match need be tried only once. */
1707 /* [unless only anchor is BOL and multiline is set] */
1708 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1709 if (s == startpos && regtry(prog, startpos))
1710 goto got_it;
1711 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1712 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1713 {
1714 char *end;
1715
1716 if (minlen)
1717 dontbother = minlen - 1;
1aa99e6b 1718 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1719 /* for multiline we only have to try after newlines */
33b8afdf 1720 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1721 if (s == startpos)
1722 goto after_try;
1723 while (1) {
1724 if (regtry(prog, s))
1725 goto got_it;
1726 after_try:
1727 if (s >= end)
1728 goto phooey;
1729 if (prog->reganch & RE_USE_INTUIT) {
1730 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1731 if (!s)
1732 goto phooey;
1733 }
1734 else
1735 s++;
1736 }
1737 } else {
1738 if (s > startpos)
1739 s--;
1740 while (s < end) {
1741 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1742 if (regtry(prog, s))
1743 goto got_it;
1744 }
1745 }
1746 }
1747 }
1748 goto phooey;
1749 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1750 if (regtry(prog, PL_reg_ganch))
1751 goto got_it;
1752 goto phooey;
1753 }
1754
1755 /* Messy cases: unanchored match. */
33b8afdf 1756 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1757 /* we have /x+whatever/ */
1758 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1759 char ch;
bf93d4cc
GS
1760#ifdef DEBUGGING
1761 int did_match = 0;
1762#endif
33b8afdf
JH
1763 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1764 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1765 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1766
1aa99e6b 1767 if (do_utf8) {
6eb5f6b9
JH
1768 while (s < strend) {
1769 if (*s == ch) {
bf93d4cc 1770 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1771 if (regtry(prog, s)) goto got_it;
1772 s += UTF8SKIP(s);
1773 while (s < strend && *s == ch)
1774 s += UTF8SKIP(s);
1775 }
1776 s += UTF8SKIP(s);
1777 }
1778 }
1779 else {
1780 while (s < strend) {
1781 if (*s == ch) {
bf93d4cc 1782 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1783 if (regtry(prog, s)) goto got_it;
1784 s++;
1785 while (s < strend && *s == ch)
1786 s++;
1787 }
1788 s++;
1789 }
1790 }
b7953727 1791 DEBUG_r(if (!did_match)
bf93d4cc 1792 PerlIO_printf(Perl_debug_log,
b7953727
JH
1793 "Did not find anchored character...\n")
1794 );
6eb5f6b9
JH
1795 }
1796 /*SUPPRESS 560*/
33b8afdf
JH
1797 else if (prog->anchored_substr != Nullsv
1798 || prog->anchored_utf8 != Nullsv
1799 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1800 && prog->float_max_offset < strend - s)) {
1801 SV *must;
1802 I32 back_max;
1803 I32 back_min;
1804 char *last;
6eb5f6b9 1805 char *last1; /* Last position checked before */
bf93d4cc
GS
1806#ifdef DEBUGGING
1807 int did_match = 0;
1808#endif
33b8afdf
JH
1809 if (prog->anchored_substr || prog->anchored_utf8) {
1810 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1811 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1812 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1813 back_max = back_min = prog->anchored_offset;
1814 } else {
1815 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1816 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1817 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1818 back_max = prog->float_max_offset;
1819 back_min = prog->float_min_offset;
1820 }
1821 if (must == &PL_sv_undef)
1822 /* could not downgrade utf8 check substring, so must fail */
1823 goto phooey;
1824
1825 last = HOP3c(strend, /* Cannot start after this */
1826 -(I32)(CHR_SVLEN(must)
1827 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1828
1829 if (s > PL_bostr)
1830 last1 = HOPc(s, -1);
1831 else
1832 last1 = s - 1; /* bogus */
1833
1834 /* XXXX check_substr already used to find `s', can optimize if
1835 check_substr==must. */
1836 scream_pos = -1;
1837 dontbother = end_shift;
1838 strend = HOPc(strend, -dontbother);
1839 while ( (s <= last) &&
9041c2e3 1840 ((flags & REXEC_SCREAM)
1aa99e6b 1841 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1842 end_shift, &scream_pos, 0))
1aa99e6b 1843 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1844 (unsigned char*)strend, must,
6eb5f6b9 1845 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
bf93d4cc 1846 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1847 if (HOPc(s, -back_max) > last1) {
1848 last1 = HOPc(s, -back_min);
1849 s = HOPc(s, -back_max);
1850 }
1851 else {
1852 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1853
1854 last1 = HOPc(s, -back_min);
1855 s = t;
1856 }
1aa99e6b 1857 if (do_utf8) {
6eb5f6b9
JH
1858 while (s <= last1) {
1859 if (regtry(prog, s))
1860 goto got_it;
1861 s += UTF8SKIP(s);
1862 }
1863 }
1864 else {
1865 while (s <= last1) {
1866 if (regtry(prog, s))
1867 goto got_it;
1868 s++;
1869 }
1870 }
1871 }
b7953727
JH
1872 DEBUG_r(if (!did_match)
1873 PerlIO_printf(Perl_debug_log,
1874 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1875 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1876 ? "anchored" : "floating"),
1877 PL_colors[0],
1878 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1879 SvPVX(must),
b7953727
JH
1880 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1881 );
6eb5f6b9
JH
1882 goto phooey;
1883 }
155aba94 1884 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1885 if (minlen) {
1886 I32 op = (U8)OP(prog->regstclass);
66e933ab 1887 /* don't bother with what can't match */
f14c76ed
RGS
1888 if (PL_regkind[op] != EXACT && op != CANY)
1889 strend = HOPc(strend, -(minlen - 1));
1890 }
ffc61ed2
JH
1891 DEBUG_r({
1892 SV *prop = sv_newmortal();
9e55ce06
JH
1893 char *s0;
1894 char *s1;
1895 int len0;
1896 int len1;
1897
ffc61ed2 1898 regprop(prop, c);
9e55ce06
JH
1899 s0 = UTF ?
1900 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1901 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1902 SvPVX(prop);
1903 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1904 s1 = UTF ?
c728cb41 1905 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1906 len1 = UTF ? SvCUR(dsv1) : strend - s;
1907 PerlIO_printf(Perl_debug_log,
1908 "Matching stclass `%*.*s' against `%*.*s'\n",
1909 len0, len0, s0,
1910 len1, len1, s1);
ffc61ed2 1911 });
6eb5f6b9
JH
1912 if (find_byclass(prog, c, s, strend, startpos, 0))
1913 goto got_it;
bf93d4cc 1914 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1915 }
1916 else {
1917 dontbother = 0;
33b8afdf
JH
1918 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1919 /* Trim the end. */
d6a28714 1920 char *last;
33b8afdf
JH
1921 SV* float_real;
1922
1923 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1924 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1925 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1926
1927 if (flags & REXEC_SCREAM) {
33b8afdf 1928 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1929 end_shift, &scream_pos, 1); /* last one */
1930 if (!last)
ffc61ed2 1931 last = scream_olds; /* Only one occurrence. */
b8c5462f 1932 }
d6a28714
JH
1933 else {
1934 STRLEN len;
33b8afdf 1935 char *little = SvPV(float_real, len);
d6a28714 1936
33b8afdf 1937 if (SvTAIL(float_real)) {
d6a28714
JH
1938 if (memEQ(strend - len + 1, little, len - 1))
1939 last = strend - len + 1;
1940 else if (!PL_multiline)
9041c2e3 1941 last = memEQ(strend - len, little, len)
d6a28714 1942 ? strend - len : Nullch;
b8c5462f 1943 else
d6a28714
JH
1944 goto find_last;
1945 } else {
1946 find_last:
9041c2e3 1947 if (len)
d6a28714 1948 last = rninstr(s, strend, little, little + len);
b8c5462f 1949 else
d6a28714 1950 last = strend; /* matching `$' */
b8c5462f 1951 }
b8c5462f 1952 }
bf93d4cc
GS
1953 if (last == NULL) {
1954 DEBUG_r(PerlIO_printf(Perl_debug_log,
1955 "%sCan't trim the tail, match fails (should not happen)%s\n",
1956 PL_colors[4],PL_colors[5]));
1957 goto phooey; /* Should not happen! */
1958 }
d6a28714
JH
1959 dontbother = strend - last + prog->float_min_offset;
1960 }
1961 if (minlen && (dontbother < minlen))
1962 dontbother = minlen - 1;
1963 strend -= dontbother; /* this one's always in bytes! */
1964 /* We don't know much -- general case. */
1aa99e6b 1965 if (do_utf8) {
d6a28714
JH
1966 for (;;) {
1967 if (regtry(prog, s))
1968 goto got_it;
1969 if (s >= strend)
1970 break;
b8c5462f 1971 s += UTF8SKIP(s);
d6a28714
JH
1972 };
1973 }
1974 else {
1975 do {
1976 if (regtry(prog, s))
1977 goto got_it;
1978 } while (s++ < strend);
1979 }
1980 }
1981
1982 /* Failure. */
1983 goto phooey;
1984
1985got_it:
1986 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1987
1988 if (PL_reg_eval_set) {
1989 /* Preserve the current value of $^R */
1990 if (oreplsv != GvSV(PL_replgv))
1991 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1992 restored, the value remains
1993 the same. */
acfe0abc 1994 restore_pos(aTHX_ 0);
d6a28714
JH
1995 }
1996
1997 /* make sure $`, $&, $', and $digit will work later */
1998 if ( !(flags & REXEC_NOT_FIRST) ) {
1999 if (RX_MATCH_COPIED(prog)) {
2000 Safefree(prog->subbeg);
2001 RX_MATCH_COPIED_off(prog);
2002 }
2003 if (flags & REXEC_COPY_STR) {
2004 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2005
2006 s = savepvn(strbeg, i);
2007 prog->subbeg = s;
2008 prog->sublen = i;
2009 RX_MATCH_COPIED_on(prog);
2010 }
2011 else {
2012 prog->subbeg = strbeg;
2013 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2014 }
2015 }
9041c2e3 2016
d6a28714
JH
2017 return 1;
2018
2019phooey:
bf93d4cc
GS
2020 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2021 PL_colors[4],PL_colors[5]));
d6a28714 2022 if (PL_reg_eval_set)
acfe0abc 2023 restore_pos(aTHX_ 0);
d6a28714
JH
2024 return 0;
2025}
2026
2027/*
2028 - regtry - try match at specific point
2029 */
2030STATIC I32 /* 0 failure, 1 success */
2031S_regtry(pTHX_ regexp *prog, char *startpos)
2032{
d6a28714
JH
2033 register I32 i;
2034 register I32 *sp;
2035 register I32 *ep;
2036 CHECKPOINT lastcp;
2037
02db2b7b
IZ
2038#ifdef DEBUGGING
2039 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2040#endif
d6a28714
JH
2041 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2042 MAGIC *mg;
2043
2044 PL_reg_eval_set = RS_init;
2045 DEBUG_r(DEBUG_s(
b900a521
JH
2046 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2047 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2048 ));
e8347627 2049 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2050 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2051 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2052 SAVETMPS;
2053 /* Apparently this is not needed, judging by wantarray. */
e8347627 2054 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2055 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2056
2057 if (PL_reg_sv) {
2058 /* Make $_ available to executed code. */
2059 if (PL_reg_sv != DEFSV) {
4d1ff10f 2060 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2061 SAVESPTR(DEFSV);
2062 DEFSV = PL_reg_sv;
b8c5462f 2063 }
d6a28714 2064
9041c2e3 2065 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2066 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2067 /* prepare for quick setting of pos */
14befaf4
DM
2068 sv_magic(PL_reg_sv, (SV*)0,
2069 PERL_MAGIC_regex_global, Nullch, 0);
2070 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2071 mg->mg_len = -1;
b8c5462f 2072 }
d6a28714
JH
2073 PL_reg_magic = mg;
2074 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2075 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2076 }
09687e5a 2077 if (!PL_reg_curpm) {
0f79a09d 2078 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2079#ifdef USE_ITHREADS
2080 {
2081 SV* repointer = newSViv(0);
577e12cc 2082 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2083 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2084 av_push(PL_regex_padav,repointer);
2085 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2086 PL_regex_pad = AvARRAY(PL_regex_padav);
2087 }
2088#endif
2089 }
aaa362c4 2090 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2091 PL_reg_oldcurpm = PL_curpm;
2092 PL_curpm = PL_reg_curpm;
2093 if (RX_MATCH_COPIED(prog)) {
2094 /* Here is a serious problem: we cannot rewrite subbeg,
2095 since it may be needed if this match fails. Thus
2096 $` inside (?{}) could fail... */
2097 PL_reg_oldsaved = prog->subbeg;
2098 PL_reg_oldsavedlen = prog->sublen;
2099 RX_MATCH_COPIED_off(prog);
2100 }
2101 else
2102 PL_reg_oldsaved = Nullch;
2103 prog->subbeg = PL_bostr;
2104 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2105 }
2106 prog->startp[0] = startpos - PL_bostr;
2107 PL_reginput = startpos;
2108 PL_regstartp = prog->startp;
2109 PL_regendp = prog->endp;
2110 PL_reglastparen = &prog->lastparen;
a01268b5 2111 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714
JH
2112 prog->lastparen = 0;
2113 PL_regsize = 0;
2114 DEBUG_r(PL_reg_starttry = startpos);
2115 if (PL_reg_start_tmpl <= prog->nparens) {
2116 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2117 if(PL_reg_start_tmp)
2118 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2119 else
2120 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2121 }
2122
2123 /* XXXX What this code is doing here?!!! There should be no need
2124 to do this again and again, PL_reglastparen should take care of
3dd2943c 2125 this! --ilya*/
dafc8851
JH
2126
2127 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2128 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2129 * PL_reglastparen), is not needed at all by the test suite
2130 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2131 * enough, for building DynaLoader, or otherwise this
2132 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2133 * will happen. Meanwhile, this code *is* needed for the
2134 * above-mentioned test suite tests to succeed. The common theme
2135 * on those tests seems to be returning null fields from matches.
2136 * --jhi */
dafc8851 2137#if 1
d6a28714
JH
2138 sp = prog->startp;
2139 ep = prog->endp;
2140 if (prog->nparens) {
eb160463 2141 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2142 *++sp = -1;
2143 *++ep = -1;
2144 }
2145 }
dafc8851 2146#endif
02db2b7b 2147 REGCP_SET(lastcp);
d6a28714
JH
2148 if (regmatch(prog->program + 1)) {
2149 prog->endp[0] = PL_reginput - PL_bostr;
2150 return 1;
2151 }
02db2b7b 2152 REGCP_UNWIND(lastcp);
d6a28714
JH
2153 return 0;
2154}
2155
02db2b7b
IZ
2156#define RE_UNWIND_BRANCH 1
2157#define RE_UNWIND_BRANCHJ 2
2158
2159union re_unwind_t;
2160
2161typedef struct { /* XX: makes sense to enlarge it... */
2162 I32 type;
2163 I32 prev;
2164 CHECKPOINT lastcp;
2165} re_unwind_generic_t;
2166
2167typedef struct {
2168 I32 type;
2169 I32 prev;
2170 CHECKPOINT lastcp;
2171 I32 lastparen;
2172 regnode *next;
2173 char *locinput;
2174 I32 nextchr;
2175#ifdef DEBUGGING
2176 int regindent;
2177#endif
2178} re_unwind_branch_t;
2179
2180typedef union re_unwind_t {
2181 I32 type;
2182 re_unwind_generic_t generic;
2183 re_unwind_branch_t branch;
2184} re_unwind_t;
2185
8ba1375e
MJD
2186#define sayYES goto yes
2187#define sayNO goto no
e0f9d4a8 2188#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2189#define sayYES_FINAL goto yes_final
2190#define sayYES_LOUD goto yes_loud
2191#define sayNO_FINAL goto no_final
2192#define sayNO_SILENT goto do_no
2193#define saySAME(x) if (x) goto yes; else goto no
2194
2195#define REPORT_CODE_OFF 24
2196
d6a28714
JH
2197/*
2198 - regmatch - main matching routine
2199 *
2200 * Conceptually the strategy is simple: check to see whether the current
2201 * node matches, call self recursively to see whether the rest matches,
2202 * and then act accordingly. In practice we make some effort to avoid
2203 * recursion, in particular by going through "ordinary" nodes (that don't
2204 * need to know whether the rest of the match failed) by a loop instead of
2205 * by recursion.
2206 */
2207/* [lwall] I've hoisted the register declarations to the outer block in order to
2208 * maybe save a little bit of pushing and popping on the stack. It also takes
2209 * advantage of machines that use a register save mask on subroutine entry.
2210 */
2211STATIC I32 /* 0 failure, 1 success */
2212S_regmatch(pTHX_ regnode *prog)
2213{
d6a28714
JH
2214 register regnode *scan; /* Current node. */
2215 regnode *next; /* Next node. */
2216 regnode *inner; /* Next node in internal branch. */
2217 register I32 nextchr; /* renamed nextchr - nextchar colides with
2218 function of same name */
2219 register I32 n; /* no or next */
b7953727
JH
2220 register I32 ln = 0; /* len or last */
2221 register char *s = Nullch; /* operand or save */
d6a28714 2222 register char *locinput = PL_reginput;
b7953727 2223 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2224 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2225 I32 unwind = 0;
b7953727 2226#if 0
02db2b7b 2227 I32 firstcp = PL_savestack_ix;
b7953727 2228#endif
53c4c00c 2229 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2230#ifdef DEBUGGING
ce333219
JH
2231 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2232 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2233 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2234#endif
02db2b7b 2235
d6a28714
JH
2236#ifdef DEBUGGING
2237 PL_regindent++;
2238#endif
2239
2240 /* Note that nextchr is a byte even in UTF */
2241 nextchr = UCHARAT(locinput);
2242 scan = prog;
2243 while (scan != NULL) {
8ba1375e 2244
2a782b5b 2245 DEBUG_r( {
d6a28714
JH
2246 SV *prop = sv_newmortal();
2247 int docolor = *PL_colors[0];
2248 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2249 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2250 /* The part of the string before starttry has one color
2251 (pref0_len chars), between starttry and current
2252 position another one (pref_len - pref0_len chars),
2253 after the current position the third one.
2254 We assume that pref0_len <= pref_len, otherwise we
2255 decrease pref0_len. */
9041c2e3 2256 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2257 ? (5 + taill) - l : locinput - PL_bostr;
2258 int pref0_len;
d6a28714 2259
df1ffd02 2260 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2261 pref_len++;
2262 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2263 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2264 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2265 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2266 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2267 l--;
d6a28714
JH
2268 if (pref0_len < 0)
2269 pref0_len = 0;
2270 if (pref0_len > pref_len)
2271 pref0_len = pref_len;
2272 regprop(prop, scan);
2a782b5b
JH
2273 {
2274 char *s0 =
f14c76ed 2275 do_utf8 && OP(scan) != CANY ?
2a782b5b 2276 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2277 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2278 locinput - pref_len;
df1ffd02 2279 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2280 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2281 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2282 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2283 locinput - pref_len + pref0_len;
df1ffd02 2284 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2285 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2286 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2287 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2288 locinput;
df1ffd02 2289 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2290 PerlIO_printf(Perl_debug_log,
2291 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2292 (IV)(locinput - PL_bostr),
2293 PL_colors[4],
2294 len0, s0,
2295 PL_colors[5],
2296 PL_colors[2],
2297 len1, s1,
2298 PL_colors[3],
2299 (docolor ? "" : "> <"),
2300 PL_colors[0],
2301 len2, s2,
2302 PL_colors[1],
2303 15 - l - pref_len + 1,
2304 "",
2305 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2306 SvPVX(prop));
2307 }
2308 });
d6a28714
JH
2309
2310 next = scan + NEXT_OFF(scan);
2311 if (next == scan)
2312 next = NULL;
2313
2314 switch (OP(scan)) {
2315 case BOL:
12d33761
HS
2316 if (locinput == PL_bostr || (PL_multiline &&
2317 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2318 {
2319 /* regtill = regbol; */
b8c5462f
JH
2320 break;
2321 }
d6a28714
JH
2322 sayNO;
2323 case MBOL:
12d33761
HS
2324 if (locinput == PL_bostr ||
2325 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2326 {
b8c5462f
JH
2327 break;
2328 }
d6a28714
JH
2329 sayNO;
2330 case SBOL:
c2a73568 2331 if (locinput == PL_bostr)
b8c5462f 2332 break;
d6a28714
JH
2333 sayNO;
2334 case GPOS:
2335 if (locinput == PL_reg_ganch)
2336 break;
2337 sayNO;
2338 case EOL:
2339 if (PL_multiline)
2340 goto meol;
2341 else
2342 goto seol;
2343 case MEOL:
2344 meol:
2345 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2346 sayNO;
b8c5462f 2347 break;
d6a28714
JH
2348 case SEOL:
2349 seol:
2350 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2351 sayNO;
d6a28714 2352 if (PL_regeol - locinput > 1)
b8c5462f 2353 sayNO;
b8c5462f 2354 break;
d6a28714
JH
2355 case EOS:
2356 if (PL_regeol != locinput)
b8c5462f 2357 sayNO;
d6a28714 2358 break;
ffc61ed2 2359 case SANY:
d6a28714 2360 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2361 sayNO;
f33976b4
DB
2362 if (do_utf8) {
2363 locinput += PL_utf8skip[nextchr];
2364 if (locinput > PL_regeol)
2365 sayNO;
2366 nextchr = UCHARAT(locinput);
2367 }
2368 else
2369 nextchr = UCHARAT(++locinput);
2370 break;
2371 case CANY:
2372 if (!nextchr && locinput >= PL_regeol)
2373 sayNO;
b8c5462f 2374 nextchr = UCHARAT(++locinput);
a0d0e21e 2375 break;
ffc61ed2 2376 case REG_ANY:
1aa99e6b
IH
2377 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2378 sayNO;
2379 if (do_utf8) {
b8c5462f 2380 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2381 if (locinput > PL_regeol)
2382 sayNO;
a0ed51b3 2383 nextchr = UCHARAT(locinput);
a0ed51b3 2384 }
1aa99e6b
IH
2385 else
2386 nextchr = UCHARAT(++locinput);
a0ed51b3 2387 break;
d6a28714 2388 case EXACT:
cd439c50
IZ
2389 s = STRING(scan);
2390 ln = STR_LEN(scan);
eb160463 2391 if (do_utf8 != UTF) {
bc517b45 2392 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2393 char *l = locinput;
2394 char *e = s + ln;
bc517b45 2395 STRLEN ulen;
a72c7584 2396
5ff6fc6d
JH
2397 if (do_utf8) {
2398 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2399 while (s < e) {
2400 if (l >= PL_regeol)
5ff6fc6d
JH
2401 sayNO;
2402 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2403 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2404 ckWARN(WARN_UTF8) ?
2405 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2406 sayNO;
bc517b45 2407 l += ulen;
5ff6fc6d 2408 s ++;
1aa99e6b 2409 }
5ff6fc6d
JH
2410 }
2411 else {
2412 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2413 while (s < e) {
2414 if (l >= PL_regeol)
2415 sayNO;
5ff6fc6d 2416 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2417 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2418 ckWARN(WARN_UTF8) ?
2419 0 : UTF8_ALLOW_ANY))
1aa99e6b 2420 sayNO;
bc517b45 2421 s += ulen;
a72c7584 2422 l ++;
1aa99e6b 2423 }
5ff6fc6d 2424 }
1aa99e6b
IH
2425 locinput = l;
2426 nextchr = UCHARAT(locinput);
2427 break;
2428 }
bc517b45 2429 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2430 /* Inline the first character, for speed. */
2431 if (UCHARAT(s) != nextchr)
2432 sayNO;
2433 if (PL_regeol - locinput < ln)
2434 sayNO;
2435 if (ln > 1 && memNE(s, locinput, ln))
2436 sayNO;
2437 locinput += ln;
2438 nextchr = UCHARAT(locinput);
2439 break;
2440 case EXACTFL:
b8c5462f
JH
2441 PL_reg_flags |= RF_tainted;
2442 /* FALL THROUGH */
d6a28714 2443 case EXACTF:
cd439c50
IZ
2444 s = STRING(scan);
2445 ln = STR_LEN(scan);
d6a28714 2446
d07ddd77
JH
2447 if (do_utf8 || UTF) {
2448 /* Either target or the pattern are utf8. */
d6a28714 2449 char *l = locinput;
d07ddd77 2450 char *e = PL_regeol;
bc517b45 2451
eb160463 2452 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2453 l, &e, 0, do_utf8)) {
5486206c
JH
2454 /* One more case for the sharp s:
2455 * pack("U0U*", 0xDF) =~ /ss/i,
2456 * the 0xC3 0x9F are the UTF-8
2457 * byte sequence for the U+00DF. */
2458 if (!(do_utf8 &&
2459 toLOWER(s[0]) == 's' &&
2460 ln >= 2 &&
2461 toLOWER(s[1]) == 's' &&
2462 (U8)l[0] == 0xC3 &&
2463 e - l >= 2 &&
2464 (U8)l[1] == 0x9F))
2465 sayNO;
2466 }
d07ddd77
JH
2467 locinput = e;
2468 nextchr = UCHARAT(locinput);
2469 break;
a0ed51b3 2470 }
d6a28714 2471
bc517b45
JH
2472 /* Neither the target and the pattern are utf8. */
2473
d6a28714
JH
2474 /* Inline the first character, for speed. */
2475 if (UCHARAT(s) != nextchr &&
2476 UCHARAT(s) != ((OP(scan) == EXACTF)
2477 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2478 sayNO;
d6a28714 2479 if (PL_regeol - locinput < ln)
b8c5462f 2480 sayNO;
d6a28714
JH
2481 if (ln > 1 && (OP(scan) == EXACTF
2482 ? ibcmp(s, locinput, ln)
2483 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2484 sayNO;
d6a28714
JH
2485 locinput += ln;
2486 nextchr = UCHARAT(locinput);
a0d0e21e 2487 break;
d6a28714 2488 case ANYOF:
ffc61ed2 2489 if (do_utf8) {
9e55ce06
JH
2490 STRLEN inclasslen = PL_regeol - locinput;
2491
ba7b4546 2492 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2493 sayNO_ANYOF;
ffc61ed2
JH
2494 if (locinput >= PL_regeol)
2495 sayNO;
0f0076b4 2496 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2497 nextchr = UCHARAT(locinput);
e0f9d4a8 2498 break;
ffc61ed2
JH
2499 }
2500 else {
2501 if (nextchr < 0)
2502 nextchr = UCHARAT(locinput);
7d3e948e 2503 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2504 sayNO_ANYOF;
ffc61ed2
JH
2505 if (!nextchr && locinput >= PL_regeol)
2506 sayNO;
2507 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2508 break;
2509 }
2510 no_anyof:
2511 /* If we might have the case of the German sharp s
2512 * in a casefolding Unicode character class. */
2513
ebc501f0
JH
2514 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2515 locinput += SHARP_S_SKIP;
e0f9d4a8 2516 nextchr = UCHARAT(locinput);
ffc61ed2 2517 }
e0f9d4a8
JH
2518 else
2519 sayNO;
b8c5462f 2520 break;
d6a28714 2521 case ALNUML:
b8c5462f
JH
2522 PL_reg_flags |= RF_tainted;
2523 /* FALL THROUGH */
d6a28714 2524 case ALNUM:
b8c5462f 2525 if (!nextchr)
4633a7c4 2526 sayNO;
ffc61ed2 2527 if (do_utf8) {
ad24be35 2528 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2529 if (!(OP(scan) == ALNUM
3568d838 2530 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2531 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2532 {
2533 sayNO;
a0ed51b3 2534 }
b8c5462f 2535 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2536 nextchr = UCHARAT(locinput);
2537 break;
2538 }
ffc61ed2 2539 if (!(OP(scan) == ALNUM
d6a28714 2540 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2541 sayNO;
b8c5462f 2542 nextchr = UCHARAT(++locinput);
a0d0e21e 2543 break;
d6a28714 2544 case NALNUML:
b8c5462f
JH
2545 PL_reg_flags |= RF_tainted;
2546 /* FALL THROUGH */
d6a28714
JH
2547 case NALNUM:
2548 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2549 sayNO;
ffc61ed2 2550 if (do_utf8) {
8269fa76 2551 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2552 if (OP(scan) == NALNUM
3568d838 2553 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2554 : isALNUM_LC_utf8((U8*)locinput))
2555 {
b8c5462f 2556 sayNO;
d6a28714 2557 }
b8c5462f
JH
2558 locinput += PL_utf8skip[nextchr];
2559 nextchr = UCHARAT(locinput);
2560 break;
2561 }
ffc61ed2 2562 if (OP(scan) == NALNUM
d6a28714 2563 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2564 sayNO;
76e3520e 2565 nextchr = UCHARAT(++locinput);
a0d0e21e 2566 break;
d6a28714
JH
2567 case BOUNDL:
2568 case NBOUNDL:
3280af22 2569 PL_reg_flags |= RF_tainted;
bbce6d69 2570 /* FALL THROUGH */
d6a28714
JH
2571 case BOUND:
2572 case NBOUND:
2573 /* was last char in word? */
ffc61ed2 2574 if (do_utf8) {
12d33761
HS
2575 if (locinput == PL_bostr)
2576 ln = '\n';
ffc61ed2 2577 else {
b4f7163a 2578 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2579
b4f7163a 2580 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2581 }
2582 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2583 ln = isALNUM_uni(ln);
8269fa76 2584 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2585 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2586 }
2587 else {
9041c2e3 2588 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2589 n = isALNUM_LC_utf8((U8*)locinput);
2590 }
a0ed51b3 2591 }
d6a28714 2592 else {
12d33761
HS
2593 ln = (locinput != PL_bostr) ?
2594 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2595 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2596 ln = isALNUM(ln);
2597 n = isALNUM(nextchr);
2598 }
2599 else {
2600 ln = isALNUM_LC(ln);
2601 n = isALNUM_LC(nextchr);
2602 }
d6a28714 2603 }
ffc61ed2
JH
2604 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2605 OP(scan) == BOUNDL))
2606 sayNO;
a0ed51b3 2607 break;
d6a28714 2608 case SPACEL:
3280af22 2609 PL_reg_flags |= RF_tainted;
bbce6d69 2610 /* FALL THROUGH */
d6a28714 2611 case SPACE:
9442cb0e 2612 if (!nextchr)
4633a7c4 2613 sayNO;
1aa99e6b 2614 if (do_utf8) {
fd400ab9 2615 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2616 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2617 if (!(OP(scan) == SPACE
3568d838 2618 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2619 : isSPACE_LC_utf8((U8*)locinput)))
2620 {
2621 sayNO;
2622 }
2623 locinput += PL_utf8skip[nextchr];
2624 nextchr = UCHARAT(locinput);
2625 break;
d6a28714 2626 }
ffc61ed2
JH
2627 if (!(OP(scan) == SPACE
2628 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2629 sayNO;
2630 nextchr = UCHARAT(++locinput);
2631 }
2632 else {
2633 if (!(OP(scan) == SPACE
2634 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2635 sayNO;
2636 nextchr = UCHARAT(++locinput);
a0ed51b3 2637 }
a0ed51b3 2638 break;
d6a28714 2639 case NSPACEL:
3280af22 2640 PL_reg_flags |= RF_tainted;
bbce6d69 2641 /* FALL THROUGH */
d6a28714 2642 case NSPACE:
9442cb0e 2643 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2644 sayNO;
1aa99e6b 2645 if (do_utf8) {
8269fa76 2646 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2647 if (OP(scan) == NSPACE
3568d838 2648 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2649 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2650 {
2651 sayNO;
2652 }
2653 locinput += PL_utf8skip[nextchr];
2654 nextchr = UCHARAT(locinput);
2655 break;
a0ed51b3 2656 }
ffc61ed2 2657 if (OP(scan) == NSPACE
d6a28714 2658 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2659 sayNO;
76e3520e 2660 nextchr = UCHARAT(++locinput);
a0d0e21e 2661 break;
d6a28714 2662 case DIGITL:
a0ed51b3
LW
2663 PL_reg_flags |= RF_tainted;
2664 /* FALL THROUGH */
d6a28714 2665 case DIGIT:
9442cb0e 2666 if (!nextchr)
a0ed51b3 2667 sayNO;
1aa99e6b 2668 if (do_utf8) {
8269fa76 2669 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2670 if (!(OP(scan) == DIGIT
3568d838 2671 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2672 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2673 {
a0ed51b3 2674 sayNO;
dfe13c55 2675 }
6f06b55f 2676 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2677 nextchr = UCHARAT(locinput);
2678 break;
2679 }
ffc61ed2 2680 if (!(OP(scan) == DIGIT
9442cb0e 2681 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2682 sayNO;
2683 nextchr = UCHARAT(++locinput);
2684 break;
d6a28714 2685 case NDIGITL:
b8c5462f
JH
2686 PL_reg_flags |= RF_tainted;
2687 /* FALL THROUGH */
d6a28714 2688 case NDIGIT:
9442cb0e 2689 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2690 sayNO;
1aa99e6b 2691 if (do_utf8) {
8269fa76 2692 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2693 if (OP(scan) == NDIGIT
3568d838 2694 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2695 : isDIGIT_LC_utf8((U8*)locinput))
2696 {
a0ed51b3 2697 sayNO;
9442cb0e 2698 }
6f06b55f 2699 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2700 nextchr = UCHARAT(locinput);
2701 break;
2702 }
ffc61ed2 2703 if (OP(scan) == NDIGIT
9442cb0e 2704 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2705 sayNO;
2706 nextchr = UCHARAT(++locinput);
2707 break;
2708 case CLUMP:
b7c83a7e 2709 if (locinput >= PL_regeol)
a0ed51b3 2710 sayNO;
b7c83a7e
JH
2711 if (do_utf8) {
2712 LOAD_UTF8_CHARCLASS(mark,"~");
2713 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2714 sayNO;
2715 locinput += PL_utf8skip[nextchr];
2716 while (locinput < PL_regeol &&
2717 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2718 locinput += UTF8SKIP(locinput);
2719 if (locinput > PL_regeol)
2720 sayNO;
eb08e2da
JH
2721 }
2722 else
2723 locinput++;
a0ed51b3
LW
2724 nextchr = UCHARAT(locinput);
2725 break;
c8756f30 2726 case REFFL:
3280af22 2727 PL_reg_flags |= RF_tainted;
c8756f30 2728 /* FALL THROUGH */
c277df42 2729 case REF:
c8756f30 2730 case REFF:
c277df42 2731 n = ARG(scan); /* which paren pair */
cf93c79d 2732 ln = PL_regstartp[n];
2c2d71f5 2733 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2734 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2735 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2736 if (ln == PL_regendp[n])
a0d0e21e 2737 break;
a0ed51b3 2738
cf93c79d 2739 s = PL_bostr + ln;
1aa99e6b 2740 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2741 char *l = locinput;
cf93c79d 2742 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2743 /*
2744 * Note that we can't do the "other character" lookup trick as
2745 * in the 8-bit case (no pun intended) because in Unicode we
2746 * have to map both upper and title case to lower case.
2747 */
2748 if (OP(scan) == REFF) {
a2a2844f 2749 STRLEN ulen1, ulen2;
e7ae6809
JH
2750 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2751 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2752 while (s < e) {
2753 if (l >= PL_regeol)
2754 sayNO;
a2a2844f
JH
2755 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2756 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2757 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2758 sayNO;
a2a2844f
JH
2759 s += ulen1;
2760 l += ulen2;
a0ed51b3
LW
2761 }
2762 }
2763 locinput = l;
2764 nextchr = UCHARAT(locinput);
2765 break;
2766 }
2767
a0d0e21e 2768 /* Inline the first character, for speed. */
76e3520e 2769 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2770 (OP(scan) == REF ||
2771 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2772 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2773 sayNO;
cf93c79d 2774 ln = PL_regendp[n] - ln;
3280af22 2775 if (locinput + ln > PL_regeol)
4633a7c4 2776 sayNO;
c8756f30
AK
2777 if (ln > 1 && (OP(scan) == REF
2778 ? memNE(s, locinput, ln)
2779 : (OP(scan) == REFF
2780 ? ibcmp(s, locinput, ln)
2781 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2782 sayNO;
a0d0e21e 2783 locinput += ln;
76e3520e 2784 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2785 break;
2786
2787 case NOTHING:
c277df42 2788 case TAIL:
a0d0e21e
LW
2789 break;
2790 case BACK:
2791 break;
c277df42
IZ
2792 case EVAL:
2793 {
2794 dSP;
533c011a 2795 OP_4tree *oop = PL_op;
3280af22 2796 COP *ocurcop = PL_curcop;
f3548bdc 2797 PAD *old_comppad;
c277df42 2798 SV *ret;
9041c2e3 2799
c277df42 2800 n = ARG(scan);
533c011a 2801 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2802 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2803 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2804 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2805
8e5e9ebe
RGS
2806 {
2807 SV **before = SP;
2808 CALLRUNOPS(aTHX); /* Scalar context. */
2809 SPAGAIN;
2810 if (SP == before)
075aa684 2811 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2812 else {
2813 ret = POPs;
2814 PUTBACK;
2815 }
2816 }
2817
0f5d15d6 2818 PL_op = oop;
f3548bdc 2819 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2820 PL_curcop = ocurcop;
c277df42 2821 if (logical) {
0f5d15d6
IZ
2822 if (logical == 2) { /* Postponed subexpression. */
2823 regexp *re;
22c35a8c 2824 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2825 re_cc_state state;
0f5d15d6 2826 CHECKPOINT cp, lastcp;
cb50f42d 2827 int toggleutf;
0f5d15d6
IZ
2828
2829 if(SvROK(ret) || SvRMAGICAL(ret)) {
2830 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2831
2832 if(SvMAGICAL(sv))
14befaf4 2833 mg = mg_find(sv, PERL_MAGIC_qr);
0f5d15d6
IZ
2834 }
2835 if (mg) {
2836 re = (regexp *)mg->mg_obj;
df0003d4 2837 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2838 }
2839 else {
2840 STRLEN len;
2841 char *t = SvPV(ret, len);
2842 PMOP pm;
2843 char *oprecomp = PL_regprecomp;
2844 I32 osize = PL_regsize;
2845 I32 onpar = PL_regnpar;
2846
5fcd1c1b 2847 Zero(&pm, 1, PMOP);
cb50f42d 2848 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2849 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2850 if (!(SvFLAGS(ret)
0f5d15d6 2851 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
14befaf4
DM
2852 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2853 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2854 PL_regprecomp = oprecomp;
2855 PL_regsize = osize;
2856 PL_regnpar = onpar;
2857 }
2858 DEBUG_r(
9041c2e3 2859 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2860 "Entering embedded `%s%.60s%s%s'\n",
2861 PL_colors[0],
2862 re->precomp,
2863 PL_colors[1],
2864 (strlen(re->precomp) > 60 ? "..." : ""))
2865 );
2866 state.node = next;
2867 state.prev = PL_reg_call_cc;
2868 state.cc = PL_regcc;
2869 state.re = PL_reg_re;
2870
2ab05381 2871 PL_regcc = 0;
9041c2e3 2872
0f5d15d6 2873 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2874 REGCP_SET(lastcp);
0f5d15d6
IZ
2875 cache_re(re);
2876 state.ss = PL_savestack_ix;
2877 *PL_reglastparen = 0;
a01268b5 2878 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2879 PL_reg_call_cc = &state;
2880 PL_reginput = locinput;
cb50f42d
YST
2881 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2882 ((re->reganch & ROPT_UTF8) != 0);
2883 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2884
2885 /* XXXX This is too dramatic a measure... */
2886 PL_reg_maxiter = 0;
2887
0f5d15d6 2888 if (regmatch(re->program + 1)) {
2c914db6
IZ
2889 /* Even though we succeeded, we need to restore
2890 global variables, since we may be wrapped inside
2891 SUSPEND, thus the match may be not finished yet. */
2892
2893 /* XXXX Do this only if SUSPENDed? */
2894 PL_reg_call_cc = state.prev;
2895 PL_regcc = state.cc;
2896 PL_reg_re = state.re;
2897 cache_re(PL_reg_re);
cb50f42d 2898 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2899
2900 /* XXXX This is too dramatic a measure... */
2901 PL_reg_maxiter = 0;
2902
2903 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2904 ReREFCNT_dec(re);
2905 regcpblow(cp);
2906 sayYES;
2907 }
0f5d15d6 2908 ReREFCNT_dec(re);
02db2b7b 2909 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2910 regcppop();
2911 PL_reg_call_cc = state.prev;
2912 PL_regcc = state.cc;
2913 PL_reg_re = state.re;
d3790889 2914 cache_re(PL_reg_re);
cb50f42d 2915 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2916
2917 /* XXXX This is too dramatic a measure... */
2918 PL_reg_maxiter = 0;
2919
8e514ae6 2920 logical = 0;
0f5d15d6
IZ
2921 sayNO;
2922 }
c277df42 2923 sw = SvTRUE(ret);
0f5d15d6 2924 logical = 0;
a0ed51b3
LW
2925 }
2926 else
3280af22 2927 sv_setsv(save_scalar(PL_replgv), ret);
c277df42
IZ
2928 break;
2929 }
a0d0e21e 2930 case OPEN:
c277df42 2931 n = ARG(scan); /* which paren pair */
3280af22
NIS
2932 PL_reg_start_tmp[n] = locinput;
2933 if (n > PL_regsize)
2934 PL_regsize = n;
a0d0e21e
LW
2935 break;
2936 case CLOSE:
c277df42 2937 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2938 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2939 PL_regendp[n] = locinput - PL_bostr;
eb160463 2940 if (n > (I32)*PL_reglastparen)
3280af22 2941 *PL_reglastparen = n;
a01268b5 2942 *PL_reglastcloseparen = n;
a0d0e21e 2943 break;
c277df42
IZ
2944 case GROUPP:
2945 n = ARG(scan); /* which paren pair */
eb160463 2946 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2947 break;
2948 case IFTHEN:
2c2d71f5 2949 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2950 if (sw)
2951 next = NEXTOPER(NEXTOPER(scan));
2952 else {
2953 next = scan + ARG(scan);
2954 if (OP(next) == IFTHEN) /* Fake one. */
2955 next = NEXTOPER(NEXTOPER(next));
2956 }
2957 break;
2958 case LOGICAL:
0f5d15d6 2959 logical = scan->flags;
c277df42 2960 break;
2ab05381
IZ
2961/*******************************************************************
2962 PL_regcc contains infoblock about the innermost (...)* loop, and
2963 a pointer to the next outer infoblock.
2964
2965 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2966
2967 1) After matching X, regnode for CURLYX is processed;
2968
9041c2e3 2969 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
2970 regmatch() recursively with the starting point at WHILEM node;
2971
2972 3) Each hit of WHILEM node tries to match A and Z (in the order
2973 depending on the current iteration, min/max of {min,max} and
2974 greediness). The information about where are nodes for "A"
2975 and "Z" is read from the infoblock, as is info on how many times "A"
2976 was already matched, and greediness.
2977
2978 4) After A matches, the same WHILEM node is hit again.
2979
2980 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2981 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2982 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2983 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2984 of the external loop.
2985
2986 Currently present infoblocks form a tree with a stem formed by PL_curcc
2987 and whatever it mentions via ->next, and additional attached trees
2988 corresponding to temporarily unset infoblocks as in "5" above.
2989
9041c2e3 2990 In the following picture infoblocks for outer loop of
2ab05381
IZ
2991 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2992 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2993 infoblocks are drawn below the "reset" infoblock.
2994
2995 In fact in the picture below we do not show failed matches for Z and T
2996 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2997 more obvious *why* one needs to *temporary* unset infoblocks.]
2998
2999 Matched REx position InfoBlocks Comment
3000 (Y(A)*?Z)*?T x
3001 Y(A)*?Z)*?T x <- O
3002 Y (A)*?Z)*?T x <- O
3003 Y A)*?Z)*?T x <- O <- I
3004 YA )*?Z)*?T x <- O <- I
3005 YA A)*?Z)*?T x <- O <- I
3006 YAA )*?Z)*?T x <- O <- I
3007 YAA Z)*?T x <- O # Temporary unset I
3008 I
3009
3010 YAAZ Y(A)*?Z)*?T x <- O
3011 I
3012
3013 YAAZY (A)*?Z)*?T x <- O
3014 I
3015
3016 YAAZY A)*?Z)*?T x <- O <- I
3017 I
3018
3019 YAAZYA )*?Z)*?T x <- O <- I
3020 I
3021
3022 YAAZYA Z)*?T x <- O # Temporary unset I
3023 I,I
3024
3025 YAAZYAZ )*?T x <- O
3026 I,I
3027
3028 YAAZYAZ T x # Temporary unset O
3029 O
3030 I,I
3031
3032 YAAZYAZT x
3033 O
3034 I,I
3035 *******************************************************************/
a0d0e21e
LW
3036 case CURLYX: {
3037 CURCUR cc;
3280af22 3038 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3039 /* No need to save/restore up to this paren */
3040 I32 parenfloor = scan->flags;
c277df42
IZ
3041
3042 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3043 next += ARG(next);
3280af22
NIS
3044 cc.oldcc = PL_regcc;
3045 PL_regcc = &cc;
cb434fcc
IZ
3046 /* XXXX Probably it is better to teach regpush to support
3047 parenfloor > PL_regsize... */
eb160463 3048 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3049 parenfloor = *PL_reglastparen; /* Pessimization... */
3050 cc.parenfloor = parenfloor;
a0d0e21e
LW
3051 cc.cur = -1;
3052 cc.min = ARG1(scan);
3053 cc.max = ARG2(scan);
c277df42 3054 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3055 cc.next = next;
3056 cc.minmod = minmod;
3057 cc.lastloc = 0;
3280af22 3058 PL_reginput = locinput;
a0d0e21e
LW
3059 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3060 regcpblow(cp);
3280af22 3061 PL_regcc = cc.oldcc;
4633a7c4 3062 saySAME(n);
a0d0e21e
LW
3063 }
3064 /* NOT REACHED */
3065 case WHILEM: {
3066 /*
3067 * This is really hard to understand, because after we match
3068 * what we're trying to match, we must make sure the rest of
2c2d71f5 3069 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3070 * to go back UP the parse tree by recursing ever deeper. And
3071 * if it fails, we have to reset our parent's current state
3072 * that we can try again after backing off.
3073 */
3074
c277df42 3075 CHECKPOINT cp, lastcp;
3280af22 3076 CURCUR* cc = PL_regcc;
c277df42
IZ
3077 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3078
4633a7c4 3079 n = cc->cur + 1; /* how many we know we matched */
3280af22 3080 PL_reginput = locinput;
a0d0e21e 3081
c277df42 3082 DEBUG_r(
9041c2e3 3083 PerlIO_printf(Perl_debug_log,
91f3b821 3084 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3085 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3086 (long)n, (long)cc->min,
2797576d 3087 (long)cc->max, PTR2UV(cc))
c277df42 3088 );
4633a7c4 3089
a0d0e21e
LW
3090 /* If degenerate scan matches "", assume scan done. */
3091
579cf2c3 3092 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3093 PL_regcc = cc->oldcc;
2ab05381
IZ
3094 if (PL_regcc)
3095 ln = PL_regcc->cur;
c277df42 3096 DEBUG_r(
c3464db5
DD
3097 PerlIO_printf(Perl_debug_log,
3098 "%*s empty match detected, try continuation...\n",
3280af22 3099 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3100 );
a0d0e21e 3101 if (regmatch(cc->next))
4633a7c4 3102 sayYES;
2ab05381
IZ
3103 if (PL_regcc)
3104 PL_regcc->cur = ln;
3280af22 3105 PL_regcc = cc;
4633a7c4 3106 sayNO;
a0d0e21e
LW
3107 }
3108
3109 /* First just match a string of min scans. */
3110
3111 if (n < cc->min) {
3112 cc->cur = n;
3113 cc->lastloc = locinput;
4633a7c4
LW
3114 if (regmatch(cc->scan))
3115 sayYES;
3116 cc->cur = n - 1;
c277df42 3117 cc->lastloc = lastloc;
4633a7c4 3118 sayNO;
a0d0e21e
LW
3119 }
3120
2c2d71f5
JH
3121 if (scan->flags) {
3122 /* Check whether we already were at this position.
3123 Postpone detection until we know the match is not
3124 *that* much linear. */
3125 if (!PL_reg_maxiter) {
3126 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3127 PL_reg_leftiter = PL_reg_maxiter;
3128 }
3129 if (PL_reg_leftiter-- == 0) {
3130 I32 size = (PL_reg_maxiter + 7)/8;
3131 if (PL_reg_poscache) {
eb160463 3132 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3133 Renew(PL_reg_poscache, size, char);
3134 PL_reg_poscache_size = size;
3135 }
3136 Zero(PL_reg_poscache, size, char);
3137 }
3138 else {
3139 PL_reg_poscache_size = size;
3140 Newz(29, PL_reg_poscache, size, char);
3141 }
3142 DEBUG_r(
3143 PerlIO_printf(Perl_debug_log,
3144 "%sDetected a super-linear match, switching on caching%s...\n",
3145 PL_colors[4], PL_colors[5])
3146 );
3147 }
3148 if (PL_reg_leftiter < 0) {
3149 I32 o = locinput - PL_bostr, b;
3150
3151 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3152 b = o % 8;
3153 o /= 8;
3154 if (PL_reg_poscache[o] & (1<<b)) {
3155 DEBUG_r(
3156 PerlIO_printf(Perl_debug_log,
3157 "%*s already tried at this position...\n",
3158 REPORT_CODE_OFF+PL_regindent*2, "")
3159 );
7821416a 3160 sayNO_SILENT;
2c2d71f5
JH
3161 }
3162 PL_reg_poscache[o] |= (1<<b);
3163 }
3164 }
3165
a0d0e21e
LW
3166 /* Prefer next over scan for minimal matching. */
3167
3168 if (cc->minmod) {
3280af22 3169 PL_regcc = cc->oldcc;
2ab05381
IZ
3170 if (PL_regcc)
3171 ln = PL_regcc->cur;
5f05dabc 3172 cp = regcppush(cc->parenfloor);
02db2b7b 3173 REGCP_SET(lastcp);
5f05dabc 3174 if (regmatch(cc->next)) {
c277df42 3175 regcpblow(cp);
4633a7c4 3176 sayYES; /* All done. */
5f05dabc 3177 }
02db2b7b 3178 REGCP_UNWIND(lastcp);
5f05dabc 3179 regcppop();
2ab05381
IZ
3180 if (PL_regcc)
3181 PL_regcc->cur = ln;
3280af22 3182 PL_regcc = cc;
a0d0e21e 3183
c277df42 3184 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3185 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3186 && !(PL_reg_flags & RF_warned)) {
3187 PL_reg_flags |= RF_warned;
9014280d 3188 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3189 "Complex regular subexpression recursion",
3190 REG_INFTY - 1);
c277df42 3191 }
4633a7c4 3192 sayNO;
c277df42 3193 }
a687059c 3194
c277df42 3195 DEBUG_r(
c3464db5
DD
3196 PerlIO_printf(Perl_debug_log,
3197 "%*s trying longer...\n",
3280af22 3198 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3199 );
a0d0e21e 3200 /* Try scanning more and see if it helps. */
3280af22 3201 PL_reginput = locinput;
a0d0e21e
LW
3202 cc->cur = n;
3203 cc->lastloc = locinput;
5f05dabc 3204 cp = regcppush(cc->parenfloor);
02db2b7b 3205 REGCP_SET(lastcp);
5f05dabc 3206 if (regmatch(cc->scan)) {
c277df42 3207 regcpblow(cp);
4633a7c4 3208 sayYES;
5f05dabc 3209 }
02db2b7b 3210 REGCP_UNWIND(lastcp);
5f05dabc 3211 regcppop();
4633a7c4 3212 cc->cur = n - 1;
c277df42 3213 cc->lastloc = lastloc;
4633a7c4 3214 sayNO;
a0d0e21e
LW
3215 }
3216
3217 /* Prefer scan over next for maximal matching. */
3218
3219 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3220 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3221 cc->cur = n;
3222 cc->lastloc = locinput;
02db2b7b 3223 REGCP_SET(lastcp);
5f05dabc 3224 if (regmatch(cc->scan)) {
c277df42 3225 regcpblow(cp);
4633a7c4 3226 sayYES;
5f05dabc 3227 }
02db2b7b 3228 REGCP_UNWIND(lastcp);
a0d0e21e 3229 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3230 PL_reginput = locinput;
c277df42 3231 DEBUG_r(
c3464db5
DD
3232 PerlIO_printf(Perl_debug_log,
3233 "%*s failed, try continuation...\n",
3280af22 3234 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3235 );
3236 }
9041c2e3 3237 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3238 && !(PL_reg_flags & RF_warned)) {
3280af22 3239 PL_reg_flags |= RF_warned;
9014280d 3240 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3241 "Complex regular subexpression recursion",
3242 REG_INFTY - 1);
a0d0e21e
LW
3243 }
3244
3245 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3246 PL_regcc = cc->oldcc;
2ab05381
IZ
3247 if (PL_regcc)
3248 ln = PL_regcc->cur;
a0d0e21e 3249 if (regmatch(cc->next))
4633a7c4 3250 sayYES;
2ab05381
IZ
3251 if (PL_regcc)
3252 PL_regcc->cur = ln;
3280af22 3253 PL_regcc = cc;
4633a7c4 3254 cc->cur = n - 1;
c277df42 3255 cc->lastloc = lastloc;
4633a7c4 3256 sayNO;
a0d0e21e
LW
3257 }
3258 /* NOT REACHED */
9041c2e3 3259 case BRANCHJ:
c277df42
IZ
3260 next = scan + ARG(scan);
3261 if (next == scan)
3262 next = NULL;
3263 inner = NEXTOPER(NEXTOPER(scan));
3264 goto do_branch;
9041c2e3 3265 case BRANCH:
c277df42
IZ
3266 inner = NEXTOPER(scan);
3267 do_branch:
3268 {
c277df42
IZ
3269 c1 = OP(scan);
3270 if (OP(next) != c1) /* No choice. */
3271 next = inner; /* Avoid recursion. */
a0d0e21e 3272 else {
02db2b7b
IZ
3273 I32 lastparen = *PL_reglastparen;
3274 I32 unwind1;
3275 re_unwind_branch_t *uw;
3276
3277 /* Put unwinding data on stack */
3278 unwind1 = SSNEWt(1,re_unwind_branch_t);
3279 uw = SSPTRt(unwind1,re_unwind_branch_t);
3280 uw->prev = unwind;
3281 unwind = unwind1;
3282 uw->type = ((c1 == BRANCH)
3283 ? RE_UNWIND_BRANCH
3284 : RE_UNWIND_BRANCHJ);
3285 uw->lastparen = lastparen;
3286 uw->next = next;
3287 uw->locinput = locinput;
3288 uw->nextchr = nextchr;
3289#ifdef DEBUGGING
3290 uw->regindent = ++PL_regindent;
3291#endif
c277df42 3292
02db2b7b
IZ
3293 REGCP_SET(uw->lastcp);
3294
3295 /* Now go into the first branch */
3296 next = inner;
a687059c 3297 }
a0d0e21e
LW
3298 }
3299 break;
3300 case MINMOD:
3301 minmod = 1;
3302 break;
c277df42
IZ
3303 case CURLYM:
3304 {
00db4c45 3305 I32 l = 0;
c277df42 3306 CHECKPOINT lastcp;
9041c2e3 3307
c277df42
IZ
3308 /* We suppose that the next guy does not need
3309 backtracking: in particular, it is of constant length,
3310 and has no parenths to influence future backrefs. */
3311 ln = ARG1(scan); /* min to match */
3312 n = ARG2(scan); /* max to match */
c277df42
IZ
3313 paren = scan->flags;
3314 if (paren) {
3280af22
NIS
3315 if (paren > PL_regsize)
3316 PL_regsize = paren;
eb160463 3317 if (paren > (I32)*PL_reglastparen)
3280af22 3318 *PL_reglastparen = paren;
c277df42 3319 }
dc45a647 3320 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3321 if (paren)
3322 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3323 PL_reginput = locinput;
c277df42
IZ
3324 if (minmod) {
3325 minmod = 0;
3326 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3327 sayNO;
f31a99c8
HS
3328 /* if we matched something zero-length we don't need to
3329 backtrack - capturing parens are already defined, so
3330 the caveat in the maximal case doesn't apply
3331
3332 XXXX if ln == 0, we can redo this check first time
3333 through the following loop
3334 */
3335 if (ln && l == 0)
3336 n = ln; /* don't backtrack */
3280af22 3337 locinput = PL_reginput;
cca55fe3 3338 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3339 regnode *text_node = next;
3340
cca55fe3 3341 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3342
cca55fe3 3343 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3344 else {
cca55fe3
JP
3345 if (PL_regkind[(U8)OP(text_node)] == REF) {
3346 I32 n, ln;
3347 n = ARG(text_node); /* which paren pair */
3348 ln = PL_regstartp[n];
3349 /* assume yes if we haven't seen CLOSEn */
3350 if (
eb160463 3351 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3352 ln == -1 ||
3353 ln == PL_regendp[n]
3354 ) {
3355 c1 = c2 = -1000;
3356 goto assume_ok_MM;
3357 }
3358 c1 = *(PL_bostr + ln);
3359 }
3360 else { c1 = (U8)*STRING(text_node); }
af5decee 3361 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3362 c2 = PL_fold[c1];
af5decee 3363 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3364 c2 = PL_fold_locale[c1];
3365 else
3366 c2 = c1;
3367 }
a0ed51b3
LW
3368 }
3369 else
c277df42 3370 c1 = c2 = -1000;
cca55fe3 3371 assume_ok_MM:
02db2b7b 3372 REGCP_SET(lastcp);
5f4b28b2 3373 /* This may be improved if l == 0. */
c277df42
IZ
3374 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3375 /* If it could work, try it. */
3376 if (c1 == -1000 ||
3280af22
NIS
3377 UCHARAT(PL_reginput) == c1 ||
3378 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3379 {
3380 if (paren) {
f31a99c8 3381 if (ln) {
cf93c79d
IZ
3382 PL_regstartp[paren] =
3383 HOPc(PL_reginput, -l) - PL_bostr;
3384 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3385 }
3386 else
cf93c79d 3387 PL_regendp[paren] = -1;
c277df42
IZ
3388 }
3389 if (regmatch(next))
3390 sayYES;
02db2b7b 3391 REGCP_UNWIND(lastcp);
c277df42
IZ
3392 }
3393 /* Couldn't or didn't -- move forward. */
3280af22 3394 PL_reginput = locinput;
c277df42
IZ
3395 if (regrepeat_hard(scan, 1, &l)) {
3396 ln++;
3280af22 3397 locinput = PL_reginput;
c277df42
IZ
3398 }
3399 else
3400 sayNO;
3401 }
a0ed51b3
LW
3402 }
3403 else {
c277df42 3404 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3405 /* if we matched something zero-length we don't need to
3406 backtrack, unless the minimum count is zero and we
3407 are capturing the result - in that case the capture
3408 being defined or not may affect later execution
3409 */
3410 if (n != 0 && l == 0 && !(paren && ln == 0))
3411 ln = n; /* don't backtrack */
3280af22 3412 locinput = PL_reginput;
c277df42 3413 DEBUG_r(
5c0ca799 3414 PerlIO_printf(Perl_debug_log,
faccc32b 3415 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3416 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3417 (IV) n, (IV)l)
c277df42
IZ
3418 );
3419 if (n >= ln) {
cca55fe3 3420 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3421 regnode *text_node = next;
3422
cca55fe3 3423 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3424
cca55fe3 3425 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3426 else {
cca55fe3
JP
3427 if (PL_regkind[(U8)OP(text_node)] == REF) {
3428 I32 n, ln;
3429 n = ARG(text_node); /* which paren pair */
3430 ln = PL_regstartp[n];
3431 /* assume yes if we haven't seen CLOSEn */
3432 if (
eb160463 3433 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3434 ln == -1 ||
3435 ln == PL_regendp[n]
3436 ) {
3437 c1 = c2 = -1000;
3438 goto assume_ok_REG;
3439 }
3440 c1 = *(PL_bostr + ln);
3441 }
3442 else { c1 = (U8)*STRING(text_node); }
3443
af5decee 3444 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3445 c2 = PL_fold[c1];
af5decee 3446 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3447 c2 = PL_fold_locale[c1];
3448 else
3449 c2 = c1;
3450 }
a0ed51b3
LW
3451 }
3452 else
c277df42
IZ
3453 c1 = c2 = -1000;
3454 }
cca55fe3 3455 assume_ok_REG:
02db2b7b 3456 REGCP_SET(lastcp);
c277df42
IZ
3457 while (n >= ln) {
3458 /* If it could work, try it. */
3459 if (c1 == -1000 ||
3280af22
NIS
3460 UCHARAT(PL_reginput) == c1 ||
3461 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3462 {
3463 DEBUG_r(
c3464db5 3464 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3465 "%*s trying tail with n=%"IVdf"...\n",
3466 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3467 );
3468 if (paren) {
3469 if (n) {
cf93c79d
IZ
3470 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3471 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3472 }
a0ed51b3 3473 else
cf93c79d 3474 PL_regendp[paren] = -1;
c277df42 3475 }
a0ed51b3
LW
3476 if (regmatch(next))
3477 sayYES;
02db2b7b 3478 REGCP_UNWIND(lastcp);
a0ed51b3 3479 }
c277df42
IZ
3480 /* Couldn't or didn't -- back up. */
3481 n--;
dfe13c55 3482 locinput = HOPc(locinput, -l);
3280af22 3483 PL_reginput = locinput;
c277df42
IZ
3484 }
3485 }
3486 sayNO;
3487 break;
3488 }
3489 case CURLYN:
3490 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3491 if (paren > PL_regsize)
3492 PL_regsize = paren;
eb160463 3493 if (paren > (I32)*PL_reglastparen)
3280af22 3494 *PL_reglastparen = paren;
c277df42
IZ
3495 ln = ARG1(scan); /* min to match */
3496 n = ARG2(scan); /* max to match */
dc45a647 3497 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3498 goto repeat;
a0d0e21e 3499 case CURLY:
c277df42 3500 paren = 0;
a0d0e21e
LW
3501 ln = ARG1(scan); /* min to match */
3502 n = ARG2(scan); /* max to match */
dc45a647 3503 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3504 goto repeat;
3505 case STAR:
3506 ln = 0;
c277df42 3507 n = REG_INFTY;
a0d0e21e 3508 scan = NEXTOPER(scan);
c277df42 3509 paren = 0;
a0d0e21e
LW
3510 goto repeat;
3511 case PLUS:
c277df42
IZ
3512 ln = 1;
3513 n = REG_INFTY;
3514 scan = NEXTOPER(scan);
3515 paren = 0;
3516 repeat:
a0d0e21e
LW
3517 /*
3518 * Lookahead to avoid useless match attempts
3519 * when we know what character comes next.
3520 */
5f80c4cf
JP
3521
3522 /*
3523 * Used to only do .*x and .*?x, but now it allows
3524 * for )'s, ('s and (?{ ... })'s to be in the way
3525 * of the quantifier and the EXACT-like node. -- japhy
3526 */
3527
cca55fe3 3528 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3529 U8 *s;
3530 regnode *text_node = next;
3531
cca55fe3 3532 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3533
cca55fe3 3534 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3535 else {
cca55fe3
JP
3536 if (PL_regkind[(U8)OP(text_node)] == REF) {
3537 I32 n, ln;
3538 n = ARG(text_node); /* which paren pair */
3539 ln = PL_regstartp[n];
3540 /* assume yes if we haven't seen CLOSEn */
3541 if (
eb160463 3542 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3543 ln == -1 ||
3544 ln == PL_regendp[n]
3545 ) {
3546 c1 = c2 = -1000;
3547 goto assume_ok_easy;
3548 }
9246c65e 3549 s = (U8*)PL_bostr + ln;
cca55fe3
JP
3550 }
3551 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3552
3553 if (!UTF) {
3554 c2 = c1 = *s;
f65d3ee7 3555 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3556 c2 = PL_fold[c1];
f65d3ee7 3557 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3558 c2 = PL_fold_locale[c1];
1aa99e6b 3559 }
5f80c4cf 3560 else { /* UTF */
f65d3ee7 3561 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3562 STRLEN ulen1, ulen2;
e7ae6809
JH
3563 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3564 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3565
3566 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3567 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3568
872c91ae
JH
3569 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3570 ckWARN(WARN_UTF8) ?
3571 0 : UTF8_ALLOW_ANY);
3572 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3573 ckWARN(WARN_UTF8) ?
3574 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3575 }
3576 else {
872c91ae
JH
3577 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3578 ckWARN(WARN_UTF8) ?
3579 0 : UTF8_ALLOW_ANY);
5f80c4cf 3580 }
1aa99e6b
IH
3581 }
3582 }
bbce6d69 3583 }
a0d0e21e 3584 else
bbce6d69 3585 c1 = c2 = -1000;
cca55fe3 3586 assume_ok_easy:
3280af22 3587 PL_reginput = locinput;
a0d0e21e 3588 if (minmod) {
c277df42 3589 CHECKPOINT lastcp;
a0d0e21e
LW
3590 minmod = 0;
3591 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3592 sayNO;
a0ed51b3 3593 locinput = PL_reginput;
02db2b7b 3594 REGCP_SET(lastcp);
0fe9bf95 3595 if (c1 != -1000) {
1aa99e6b 3596 char *e; /* Should not check after this */
0fe9bf95 3597 char *old = locinput;
b2f2f093 3598 int count = 0;
0fe9bf95 3599
1aa99e6b 3600 if (n == REG_INFTY) {
0fe9bf95 3601 e = PL_regeol - 1;
1aa99e6b
IH
3602 if (do_utf8)
3603 while (UTF8_IS_CONTINUATION(*(U8*)e))
3604 e--;
3605 }
3606 else if (do_utf8) {
3607 int m = n - ln;
3608 for (e = locinput;
3609 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3610 e += UTF8SKIP(e);
3611 }
3612 else {
3613 e = locinput + n - ln;
3614 if (e >= PL_regeol)
3615 e = PL_regeol - 1;
3616 }
0fe9bf95
IZ
3617 while (1) {
3618 /* Find place 'next' could work */
1aa99e6b
IH
3619 if (!do_utf8) {
3620 if (c1 == c2) {
a8e8ab15
JH
3621 while (locinput <= e &&
3622 UCHARAT(locinput) != c1)
1aa99e6b
IH
3623 locinput++;
3624 } else {
9041c2e3 3625 while (locinput <= e
a8e8ab15
JH
3626 && UCHARAT(locinput) != c1
3627 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3628 locinput++;
3629 }
3630 count = locinput - old;
3631 }
3632 else {
3633 STRLEN len;
3634 if (c1 == c2) {
872c91ae
JH
3635 /* count initialised to
3636 * utf8_distance(old, locinput) */
b2f2f093 3637 while (locinput <= e &&
872c91ae
JH
3638 utf8n_to_uvchr((U8*)locinput,
3639 UTF8_MAXLEN, &len,
3640 ckWARN(WARN_UTF8) ?
eb160463 3641 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3642 locinput += len;
b2f2f093
JH
3643 count++;
3644 }
1aa99e6b 3645 } else {
872c91ae
JH
3646 /* count initialised to
3647 * utf8_distance(old, locinput) */
b2f2f093 3648 while (locinput <= e) {
872c91ae
JH
3649 UV c = utf8n_to_uvchr((U8*)locinput,
3650 UTF8_MAXLEN, &len,
3651 ckWARN(WARN_UTF8) ?
3652 0 : UTF8_ALLOW_ANY);
eb160463 3653 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3654 break;
b2f2f093
JH
3655 locinput += len;
3656 count++;
1aa99e6b
IH
3657 }
3658 }
0fe9bf95 3659 }
9041c2e3 3660 if (locinput > e)
0fe9bf95
IZ
3661 sayNO;
3662 /* PL_reginput == old now */
3663 if (locinput != old) {
3664 ln = 1; /* Did some */
1aa99e6b 3665 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3666 sayNO;
3667 }
3668 /* PL_reginput == locinput now */
29d1e993 3669 TRYPAREN(paren, ln, locinput);
0fe9bf95 3670 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3671 REGCP_UNWIND(lastcp);
0fe9bf95 3672 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3673 old = locinput;
3674 if (do_utf8)
3675 locinput += UTF8SKIP(locinput);
3676 else
3677 locinput++;
b2f2f093 3678 count = 1;
0fe9bf95
IZ
3679 }
3680 }
3681 else
c277df42 3682 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3683 UV c;
3684 if (c1 != -1000) {
3685 if (do_utf8)
872c91ae
JH
3686 c = utf8n_to_uvchr((U8*)PL_reginput,
3687 UTF8_MAXLEN, 0,
3688 ckWARN(WARN_UTF8) ?
3689 0 : UTF8_ALLOW_ANY);
1aa99e6b 3690 else
9041c2e3 3691 c = UCHARAT(PL_reginput);
2390ecbc 3692 /* If it could work, try it. */
eb160463 3693 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3694 {
ecc99935 3695 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3696 REGCP_UNWIND(lastcp);
3697 }
1aa99e6b 3698 }
a0d0e21e 3699 /* If it could work, try it. */
2390ecbc 3700 else if (c1 == -1000)
bbce6d69 3701 {
ecc99935 3702 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3703 REGCP_UNWIND(lastcp);
bbce6d69 3704 }
c277df42 3705 /* Couldn't or didn't -- move forward. */
a0ed51b3 3706 PL_reginput = locinput;
a0d0e21e
LW
3707 if (regrepeat(scan, 1)) {
3708 ln++;
a0ed51b3
LW
3709 locinput = PL_reginput;
3710 }
3711 else
4633a7c4 3712 sayNO;
a0d0e21e
LW
3713 }
3714 }
3715 else {
c277df42 3716 CHECKPOINT lastcp;
a0d0e21e 3717 n = regrepeat(scan, n);
a0ed51b3 3718 locinput = PL_reginput;
22c35a8c 3719 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3720 ((!PL_multiline && OP(next) != MEOL) ||
3721 OP(next) == SEOL || OP(next) == EOS))
3722 {
a0d0e21e 3723 ln = n; /* why back off? */
1aeab75a
GS
3724 /* ...because $ and \Z can match before *and* after
3725 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3726 We should back off by one in this case. */
3727 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3728 ln--;
3729 }
02db2b7b 3730 REGCP_SET(lastcp);
c277df42 3731 if (paren) {
8fa7f367 3732 UV c = 0;
c277df42 3733 while (n >= ln) {
1aa99e6b
IH
3734 if (c1 != -1000) {
3735 if (do_utf8)
872c91ae
JH
3736 c = utf8n_to_uvchr((U8*)PL_reginput,
3737 UTF8_MAXLEN, 0,
3738 ckWARN(WARN_UTF8) ?
3739 0 : UTF8_ALLOW_ANY);
1aa99e6b 3740 else
9041c2e3 3741 c = UCHARAT(PL_reginput);
1aa99e6b 3742 }
c277df42 3743 /* If it could work, try it. */
eb160463 3744 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3745 {
29d1e993 3746 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3747 REGCP_UNWIND(lastcp);
c277df42
IZ
3748 }
3749 /* Couldn't or didn't -- back up. */
3750 n--;
dfe13c55 3751 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3752 }
a0ed51b3
LW
3753 }
3754 else {
8fa7f367 3755 UV c = 0;
c277df42 3756 while (n >= ln) {
1aa99e6b
IH
3757 if (c1 != -1000) {
3758 if (do_utf8)
872c91ae
JH
3759 c = utf8n_to_uvchr((U8*)PL_reginput,
3760 UTF8_MAXLEN, 0,
3761 ckWARN(WARN_UTF8) ?
3762 0 : UTF8_ALLOW_ANY);
1aa99e6b 3763 else
9041c2e3 3764 c = UCHARAT(PL_reginput);
1aa99e6b 3765 }
c277df42 3766 /* If it could work, try it. */
eb160463 3767 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3768 {
29d1e993 3769 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3770 REGCP_UNWIND(lastcp);
c277df42
IZ
3771 }
3772 /* Couldn't or didn't -- back up. */
3773 n--;
dfe13c55 3774 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3775 }
a0d0e21e
LW
3776 }
3777 }
4633a7c4 3778 sayNO;
c277df42 3779 break;
a0d0e21e 3780 case END:
0f5d15d6
IZ
3781 if (PL_reg_call_cc) {
3782 re_cc_state *cur_call_cc = PL_reg_call_cc;
3783 CURCUR *cctmp = PL_regcc;
3784 regexp *re = PL_reg_re;
3785 CHECKPOINT cp, lastcp;
3786
3787 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3788 REGCP_SET(lastcp);
0f5d15d6
IZ
3789 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3790 the caller. */
3791 PL_reginput = locinput; /* Make position available to
3792 the callcc. */
3793 cache_re(PL_reg_call_cc->re);
3794 PL_regcc = PL_reg_call_cc->cc;
3795 PL_reg_call_cc = PL_reg_call_cc->prev;
3796 if (regmatch(cur_call_cc->node)) {
3797 PL_reg_call_cc = cur_call_cc;
3798 regcpblow(cp);
3799 sayYES;
3800 }
02db2b7b 3801 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3802 regcppop();
3803 PL_reg_call_cc = cur_call_cc;
3804 PL_regcc = cctmp;
3805 PL_reg_re = re;
3806 cache_re(re);
3807
3808 DEBUG_r(
3809 PerlIO_printf(Perl_debug_log,
3810 "%*s continuation failed...\n",
3811 REPORT_CODE_OFF+PL_regindent*2, "")
3812 );
7821416a 3813 sayNO_SILENT;
0f5d15d6 3814 }
7821416a
IZ
3815 if (locinput < PL_regtill) {
3816 DEBUG_r(PerlIO_printf(Perl_debug_log,
3817 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3818 PL_colors[4],
3819 (long)(locinput - PL_reg_starttry),
3820 (long)(PL_regtill - PL_reg_starttry),
3821 PL_colors[5]));
3822 sayNO_FINAL; /* Cannot match: too short. */
3823 }
3824 PL_reginput = locinput; /* put where regtry can find it */
3825 sayYES_FINAL; /* Success! */
7e5428c5 3826 case SUCCEED:
3280af22 3827 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3828 sayYES_LOUD; /* Success! */
c277df42
IZ
3829 case SUSPEND:
3830 n = 1;
9fe1d20c 3831 PL_reginput = locinput;
9041c2e3 3832 goto do_ifmatch;
a0d0e21e 3833 case UNLESSM:
c277df42 3834 n = 0;
a0ed51b3 3835 if (scan->flags) {
efb30f32
HS
3836 s = HOPBACKc(locinput, scan->flags);
3837 if (!s)
3838 goto say_yes;
3839 PL_reginput = s;
a0ed51b3
LW
3840 }
3841 else
3842 PL_reginput = locinput;
c277df42
IZ
3843 goto do_ifmatch;
3844 case IFMATCH:
3845 n = 1;
a0ed51b3 3846 if (scan->flags) {
efb30f32
HS
3847 s = HOPBACKc(locinput, scan->flags);
3848 if (!s)
3849 goto say_no;
3850 PL_reginput = s;
a0ed51b3
LW
3851 }
3852 else
3853 PL_reginput = locinput;
3854
c277df42 3855 do_ifmatch:
c277df42
IZ
3856 inner = NEXTOPER(NEXTOPER(scan));
3857 if (regmatch(inner) != n) {
3858 say_no:
3859 if (logical) {
3860 logical = 0;
3861 sw = 0;
3862 goto do_longjump;
a0ed51b3
LW
3863 }
3864 else
c277df42
IZ
3865 sayNO;
3866 }
3867 say_yes:
3868 if (logical) {
3869 logical = 0;
3870 sw = 1;
3871 }
fe44a5e8 3872 if (OP(scan) == SUSPEND) {
3280af22 3873 locinput = PL_reginput;
565764a8 3874 nextchr = UCHARAT(locinput);
fe44a5e8 3875 }
c277df42
IZ
3876 /* FALL THROUGH. */
3877 case LONGJMP:
3878 do_longjump:
3879 next = scan + ARG(scan);
3880 if (next == scan)
3881 next = NULL;
a0d0e21e
LW
3882 break;
3883 default:
b900a521 3884 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3885 PTR2UV(scan), OP(scan));
cea2e8a9 3886 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3887 }
02db2b7b 3888 reenter:
a0d0e21e
LW
3889 scan = next;
3890 }
a687059c 3891
a0d0e21e
LW
3892 /*
3893 * We get here only if there's trouble -- normally "case END" is
3894 * the terminating point.
3895 */
cea2e8a9 3896 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3897 /*NOTREACHED*/
4633a7c4
LW
3898 sayNO;
3899
7821416a
IZ
3900yes_loud:
3901 DEBUG_r(
3902 PerlIO_printf(Perl_debug_log,
3903 "%*s %scould match...%s\n",
3904 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3905 );
3906 goto yes;
3907yes_final:
3908 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3909 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3910yes:
3911#ifdef DEBUGGING
3280af22 3912 PL_regindent--;
4633a7c4 3913#endif
02db2b7b
IZ
3914
3915#if 0 /* Breaks $^R */
3916 if (unwind)
3917 regcpblow(firstcp);
3918#endif
4633a7c4
LW
3919 return 1;
3920
3921no:
7821416a
IZ
3922 DEBUG_r(
3923 PerlIO_printf(Perl_debug_log,
3924 "%*s %sfailed...%s\n",
3925 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3926 );
3927 goto do_no;
3928no_final:
3929do_no:
02db2b7b
IZ
3930 if (unwind) {
3931 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3932
3933 switch (uw->type) {
3934 case RE_UNWIND_BRANCH:
3935 case RE_UNWIND_BRANCHJ:
3936 {
3937 re_unwind_branch_t *uwb = &(uw->branch);
3938 I32 lastparen = uwb->lastparen;
9041c2e3 3939
02db2b7b
IZ
3940 REGCP_UNWIND(uwb->lastcp);
3941 for (n = *PL_reglastparen; n > lastparen; n--)
3942 PL_regendp[n] = -1;
3943 *PL_reglastparen = n;
3944 scan = next = uwb->next;
9041c2e3
NIS
3945 if ( !scan ||
3946 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3947 ? BRANCH : BRANCHJ) ) { /* Failure */
3948 unwind = uwb->prev;
3949#ifdef DEBUGGING
3950 PL_regindent--;
3951#endif
3952 goto do_no;
3953 }
3954 /* Have more choice yet. Reuse the same uwb. */
3955 /*SUPPRESS 560*/
3956 if ((n = (uwb->type == RE_UNWIND_BRANCH
3957 ? NEXT_OFF(next) : ARG(next))))
3958 next += n;
3959 else
3960 next = NULL; /* XXXX Needn't unwinding in this case... */
3961 uwb->next = next;
3962 next = NEXTOPER(scan);
3963 if (uwb->type == RE_UNWIND_BRANCHJ)
3964 next = NEXTOPER(next);
3965 locinput = uwb->locinput;
3966 nextchr = uwb->nextchr;
3967#ifdef DEBUGGING
3968 PL_regindent = uwb->regindent;
3969#endif
3970
3971 goto reenter;
3972 }
3973 /* NOT REACHED */
3974 default:
3975 Perl_croak(aTHX_ "regexp unwind memory corruption");
3976 }
3977 /* NOT REACHED */
3978 }
4633a7c4 3979#ifdef DEBUGGING
3280af22 3980 PL_regindent--;
4633a7c4 3981#endif
a0d0e21e 3982 return 0;
a687059c
LW
3983}
3984
3985/*
3986 - regrepeat - repeatedly match something simple, report how many
3987 */
3988/*
3989 * [This routine now assumes that it will only match on things of length 1.
3990 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 3991 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 3992 */
76e3520e 3993STATIC I32
cea2e8a9 3994S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 3995{
a0d0e21e 3996 register char *scan;
a0d0e21e 3997 register I32 c;
3280af22 3998 register char *loceol = PL_regeol;
a0ed51b3 3999 register I32 hardcount = 0;
53c4c00c 4000 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4001
3280af22 4002 scan = PL_reginput;
faf11cac
HS
4003 if (max == REG_INFTY)
4004 max = I32_MAX;
4005 else if (max < loceol - scan)
a0d0e21e 4006 loceol = scan + max;
a0d0e21e 4007 switch (OP(p)) {
22c35a8c 4008 case REG_ANY:
1aa99e6b 4009 if (do_utf8) {
ffc61ed2 4010 loceol = PL_regeol;
1aa99e6b 4011 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4012 scan += UTF8SKIP(scan);
4013 hardcount++;
4014 }
4015 } else {
4016 while (scan < loceol && *scan != '\n')
4017 scan++;
a0ed51b3
LW
4018 }
4019 break;
ffc61ed2 4020 case SANY:
def8e4ea
JH
4021 if (do_utf8) {
4022 loceol = PL_regeol;
a0804c9e 4023 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4024 scan += UTF8SKIP(scan);
4025 hardcount++;
4026 }
4027 }
4028 else
4029 scan = loceol;
a0ed51b3 4030 break;
f33976b4
DB
4031 case CANY:
4032 scan = loceol;
4033 break;
bbce6d69 4034 case EXACT: /* length of string is 1 */
cd439c50 4035 c = (U8)*STRING(p);
bbce6d69 4036 while (scan < loceol && UCHARAT(scan) == c)
4037 scan++;
4038 break;
4039 case EXACTF: /* length of string is 1 */
cd439c50 4040 c = (U8)*STRING(p);
bbce6d69 4041 while (scan < loceol &&
22c35a8c 4042 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4043 scan++;
4044 break;
4045 case EXACTFL: /* length of string is 1 */
3280af22 4046 PL_reg_flags |= RF_tainted;
cd439c50 4047 c = (U8)*STRING(p);
bbce6d69 4048 while (scan < loceol &&
22c35a8c 4049 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4050 scan++;
4051 break;
4052 case ANYOF:
ffc61ed2
JH
4053 if (do_utf8) {
4054 loceol = PL_regeol;
1aa99e6b 4055 while (hardcount < max && scan < loceol &&
ba7b4546 4056 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4057 scan += UTF8SKIP(scan);
4058 hardcount++;
4059 }
4060 } else {
7d3e948e 4061 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4062 scan++;
4063 }
a0d0e21e
LW
4064 break;
4065 case ALNUM:
1aa99e6b 4066 if (do_utf8) {
ffc61ed2 4067 loceol = PL_regeol;
8269fa76 4068 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4069 while (hardcount < max && scan < loceol &&
3568d838 4070 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4071 scan += UTF8SKIP(scan);
4072 hardcount++;
4073 }
4074 } else {
4075 while (scan < loceol && isALNUM(*scan))
4076 scan++;
a0ed51b3
LW
4077 }
4078 break;
bbce6d69 4079 case ALNUML:
3280af22 4080 PL_reg_flags |= RF_tainted;
1aa99e6b 4081 if (do_utf8) {
ffc61ed2 4082 loceol = PL_regeol;
1aa99e6b
IH
4083 while (hardcount < max && scan < loceol &&
4084 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4085 scan += UTF8SKIP(scan);
4086 hardcount++;
4087 }
4088 } else {
4089 while (scan < loceol && isALNUM_LC(*scan))
4090 scan++;
a0ed51b3
LW
4091 }
4092 break;
a0d0e21e 4093 case NALNUM:
1aa99e6b 4094 if (do_utf8) {
ffc61ed2 4095 loceol = PL_regeol;
8269fa76 4096 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4097 while (hardcount < max && scan < loceol &&
3568d838 4098 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4099 scan += UTF8SKIP(scan);
4100 hardcount++;
4101 }
4102 } else {
4103 while (scan < loceol && !isALNUM(*scan))
4104 scan++;
a0ed51b3
LW
4105 }
4106 break;
bbce6d69 4107 case NALNUML:
3280af22 4108 PL_reg_flags |= RF_tainted;
1aa99e6b 4109 if (do_utf8) {
ffc61ed2 4110 loceol = PL_regeol;
1aa99e6b
IH
4111 while (hardcount < max && scan < loceol &&
4112 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4113 scan += UTF8SKIP(scan);
4114 hardcount++;
4115 }
4116 } else {
4117 while (scan < loceol && !isALNUM_LC(*scan))
4118 scan++;
a0ed51b3
LW
4119 }
4120 break;
a0d0e21e 4121 case SPACE:
1aa99e6b 4122 if (do_utf8) {
ffc61ed2 4123 loceol = PL_regeol;
8269fa76 4124 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4125 while (hardcount < max && scan < loceol &&
3568d838
JH
4126 (*scan == ' ' ||
4127 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4128 scan += UTF8SKIP(scan);
4129 hardcount++;
4130 }
4131 } else {
4132 while (scan < loceol && isSPACE(*scan))
4133 scan++;
a0ed51b3
LW
4134 }
4135 break;
bbce6d69 4136 case SPACEL:
3280af22 4137 PL_reg_flags |= RF_tainted;
1aa99e6b 4138 if (do_utf8) {
ffc61ed2 4139 loceol = PL_regeol;
1aa99e6b 4140 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4141 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4142 scan += UTF8SKIP(scan);
4143 hardcount++;
4144 }
4145 } else {
4146 while (scan < loceol && isSPACE_LC(*scan))
4147 scan++;
a0ed51b3
LW
4148 }
4149 break;
a0d0e21e 4150 case NSPACE:
1aa99e6b 4151 if (do_utf8) {
ffc61ed2 4152 loceol = PL_regeol;
8269fa76 4153 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4154 while (hardcount < max && scan < loceol &&
3568d838
JH
4155 !(*scan == ' ' ||
4156 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4157 scan += UTF8SKIP(scan);
4158 hardcount++;
4159 }
4160 } else {
4161 while (scan < loceol && !isSPACE(*scan))
4162 scan++;
4163 break;
a0ed51b3 4164 }
bbce6d69 4165 case NSPACEL:
3280af22 4166 PL_reg_flags |= RF_tainted;
1aa99e6b 4167 if (do_utf8) {
ffc61ed2 4168 loceol = PL_regeol;
1aa99e6b 4169 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4170 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4171 scan += UTF8SKIP(scan);
4172 hardcount++;
4173 }
4174 } else {
4175 while (scan < loceol && !isSPACE_LC(*scan))
4176 scan++;
a0ed51b3
LW
4177 }
4178 break;
a0d0e21e 4179 case DIGIT:
1aa99e6b 4180 if (do_utf8) {
ffc61ed2 4181 loceol = PL_regeol;
8269fa76 4182 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4183 while (hardcount < max && scan < loceol &&
3568d838 4184 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4185 scan += UTF8SKIP(scan);
4186 hardcount++;
4187 }
4188 } else {
4189 while (scan < loceol && isDIGIT(*scan))
4190 scan++;
a0ed51b3
LW
4191 }
4192 break;
a0d0e21e 4193 case NDIGIT:
1aa99e6b 4194 if (do_utf8) {
ffc61ed2 4195 loceol = PL_regeol;
8269fa76 4196 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4197 while (hardcount < max && scan < loceol &&
3568d838 4198 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4199 scan += UTF8SKIP(scan);
4200 hardcount++;
4201 }
4202 } else {
4203 while (scan < loceol && !isDIGIT(*scan))
4204 scan++;
a0ed51b3
LW
4205 }
4206 break;
a0d0e21e
LW
4207 default: /* Called on something of 0 width. */
4208 break; /* So match right here or not at all. */
4209 }
a687059c 4210
a0ed51b3
LW
4211 if (hardcount)
4212 c = hardcount;
4213 else
4214 c = scan - PL_reginput;
3280af22 4215 PL_reginput = scan;
a687059c 4216
9041c2e3 4217 DEBUG_r(
c277df42
IZ
4218 {
4219 SV *prop = sv_newmortal();
4220
4221 regprop(prop, p);
9041c2e3
NIS
4222 PerlIO_printf(Perl_debug_log,
4223 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4224 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4225 });
9041c2e3 4226
a0d0e21e 4227 return(c);
a687059c
LW
4228}
4229
4230/*
c277df42 4231 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4232 *
c277df42
IZ
4233 * The repeater is supposed to have constant length.
4234 */
4235
76e3520e 4236STATIC I32
cea2e8a9 4237S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4238{
b7953727 4239 register char *scan = Nullch;
c277df42 4240 register char *start;
3280af22 4241 register char *loceol = PL_regeol;
a0ed51b3 4242 I32 l = 0;
708e3b05 4243 I32 count = 0, res = 1;
a0ed51b3
LW
4244
4245 if (!max)
4246 return 0;
c277df42 4247
3280af22 4248 start = PL_reginput;
53c4c00c 4249 if (PL_reg_match_utf8) {
708e3b05 4250 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4251 if (!count++) {
4252 l = 0;
4253 while (start < PL_reginput) {
4254 l++;
4255 start += UTF8SKIP(start);
4256 }
4257 *lp = l;
4258 if (l == 0)
4259 return max;
4260 }
4261 if (count == max)
4262 return count;
4263 }
4264 }
4265 else {
708e3b05 4266 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4267 if (!count++) {
4268 *lp = l = PL_reginput - start;
4269 if (max != REG_INFTY && l*max < loceol - scan)
4270 loceol = scan + l*max;
4271 if (l == 0)
4272 return max;
c277df42
IZ
4273 }
4274 }
4275 }
708e3b05 4276 if (!res)
3280af22 4277 PL_reginput = scan;
9041c2e3 4278
a0ed51b3 4279 return count;
c277df42
IZ
4280}
4281
4282/*
ffc61ed2
JH
4283- regclass_swash - prepare the utf8 swash
4284*/
4285
4286SV *
9e55ce06 4287Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4288{
9e55ce06
JH
4289 SV *sw = NULL;
4290 SV *si = NULL;
4291 SV *alt = NULL;
ffc61ed2
JH
4292
4293 if (PL_regdata && PL_regdata->count) {
4294 U32 n = ARG(node);
4295
4296 if (PL_regdata->what[n] == 's') {
4297 SV *rv = (SV*)PL_regdata->data[n];
4298 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4299 SV **ary = AvARRAY(av);
9e55ce06 4300 SV **a, **b;
9041c2e3 4301
9e55ce06
JH
4302 /* See the end of regcomp.c:S_reglass() for
4303 * documentation of these array elements. */
4304
b11f357e
JH
4305 si = *ary;
4306 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4307 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4308
ffc61ed2
JH
4309 if (a)
4310 sw = *a;
4311 else if (si && doinit) {
4312 sw = swash_init("utf8", "", si, 1, 0);
4313 (void)av_store(av, 1, sw);
4314 }
9e55ce06
JH
4315 if (b)
4316 alt = *b;
ffc61ed2
JH
4317 }
4318 }
4319
9e55ce06
JH
4320 if (listsvp)
4321 *listsvp = si;
4322 if (altsvp)
4323 *altsvp = alt;
ffc61ed2
JH
4324
4325 return sw;
4326}
4327
4328/*
ba7b4546 4329 - reginclass - determine if a character falls into a character class
832705d4
JH
4330
4331 The n is the ANYOF regnode, the p is the target string, lenp
4332 is pointer to the maximum length of how far to go in the p
4333 (if the lenp is zero, UTF8SKIP(p) is used),
4334 do_utf8 tells whether the target string is in UTF-8.
4335
bbce6d69 4336 */
4337
76e3520e 4338STATIC bool
ba7b4546 4339S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4340{
ffc61ed2 4341 char flags = ANYOF_FLAGS(n);
bbce6d69 4342 bool match = FALSE;
cc07378b 4343 UV c = *p;
ae9ddab8 4344 STRLEN len = 0;
9e55ce06 4345 STRLEN plen;
1aa99e6b 4346
ae9ddab8
JH
4347 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4348 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4349 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4350
0f0076b4 4351 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4352 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4353 if (lenp)
4354 *lenp = 0;
ffc61ed2 4355 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4356 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4357 match = TRUE;
bbce6d69 4358 }
3568d838 4359 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4360 match = TRUE;
ffc61ed2 4361 if (!match) {
9e55ce06
JH
4362 AV *av;
4363 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4364
4365 if (sw) {
3568d838 4366 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4367 match = TRUE;
4368 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4369 if (!match && lenp && av) {
4370 I32 i;
4371
4372 for (i = 0; i <= av_len(av); i++) {
4373 SV* sv = *av_fetch(av, i, FALSE);
4374 STRLEN len;
4375 char *s = SvPV(sv, len);
4376
061b10df 4377 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4378 *lenp = len;
4379 match = TRUE;
4380 break;
4381 }
4382 }
4383 }
4384 if (!match) {
4a623e43
JH
4385 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4386 STRLEN tmplen;
4387
9e55ce06
JH
4388 to_utf8_fold(p, tmpbuf, &tmplen);
4389 if (swash_fetch(sw, tmpbuf, do_utf8))
4390 match = TRUE;
4391 }
ffc61ed2
JH
4392 }
4393 }
bbce6d69 4394 }
9e55ce06 4395 if (match && lenp && *lenp == 0)
0f0076b4 4396 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4397 }
1aa99e6b 4398 if (!match && c < 256) {
ffc61ed2
JH
4399 if (ANYOF_BITMAP_TEST(n, c))
4400 match = TRUE;
4401 else if (flags & ANYOF_FOLD) {
eb160463 4402 U8 f;
a0ed51b3 4403
ffc61ed2
JH
4404 if (flags & ANYOF_LOCALE) {
4405 PL_reg_flags |= RF_tainted;
4406 f = PL_fold_locale[c];
4407 }
4408 else
4409 f = PL_fold[c];
4410 if (f != c && ANYOF_BITMAP_TEST(n, f))
4411 match = TRUE;
4412 }
4413
4414 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4415 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4416 if (
4417 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4418 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4419 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4420 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4421 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4422 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4423 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4424 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4425 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4426 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4427 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4428 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4429 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4430 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4431 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4432 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4433 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4434 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4435 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4436 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4437 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4438 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4439 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4440 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4441 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4442 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4443 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4444 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4445 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4446 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4447 ) /* How's that for a conditional? */
4448 {
4449 match = TRUE;
4450 }
a0ed51b3 4451 }
a0ed51b3
LW
4452 }
4453
a0ed51b3
LW
4454 return (flags & ANYOF_INVERT) ? !match : match;
4455}
161b471a 4456
dfe13c55 4457STATIC U8 *
cea2e8a9 4458S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4459{
1aa99e6b
IH
4460 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4461}
4462
4463STATIC U8 *
4464S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4465{
a0ed51b3 4466 if (off >= 0) {
1aa99e6b 4467 while (off-- && s < lim) {
ffc61ed2 4468 /* XXX could check well-formedness here */
a0ed51b3 4469 s += UTF8SKIP(s);
ffc61ed2 4470 }
a0ed51b3
LW
4471 }
4472 else {
4473 while (off++) {
1aa99e6b 4474 if (s > lim) {
a0ed51b3 4475 s--;
ffc61ed2 4476 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4477 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4478 s--;
ffc61ed2
JH
4479 }
4480 /* XXX could check well-formedness here */
a0ed51b3
LW
4481 }
4482 }
4483 }
4484 return s;
4485}
161b471a 4486
dfe13c55 4487STATIC U8 *
1aa99e6b 4488S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4489{
1aa99e6b
IH
4490 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4491}
4492
4493STATIC U8 *
4494S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4495{
4496 if (off >= 0) {
1aa99e6b 4497 while (off-- && s < lim) {
ffc61ed2 4498 /* XXX could check well-formedness here */
a0ed51b3 4499 s += UTF8SKIP(s);
ffc61ed2 4500 }
a0ed51b3
LW
4501 if (off >= 0)
4502 return 0;
4503 }
4504 else {
4505 while (off++) {
1aa99e6b 4506 if (s > lim) {
a0ed51b3 4507 s--;
ffc61ed2 4508 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4509 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4510 s--;
ffc61ed2
JH
4511 }
4512 /* XXX could check well-formedness here */
a0ed51b3
LW
4513 }
4514 else
4515 break;
4516 }
4517 if (off <= 0)
4518 return 0;
4519 }
4520 return s;
4521}
51371543 4522
51371543 4523static void
acfe0abc 4524restore_pos(pTHX_ void *arg)
51371543 4525{
51371543
GS
4526 if (PL_reg_eval_set) {
4527 if (PL_reg_oldsaved) {
4528 PL_reg_re->subbeg = PL_reg_oldsaved;
4529 PL_reg_re->sublen = PL_reg_oldsavedlen;
4530 RX_MATCH_COPIED_on(PL_reg_re);
4531 }
4532 PL_reg_magic->mg_len = PL_reg_oldpos;
4533 PL_reg_eval_set = 0;
4534 PL_curpm = PL_reg_oldcurpm;
4535 }
4536}
33b8afdf
JH
4537
4538STATIC void
4539S_to_utf8_substr(pTHX_ register regexp *prog)
4540{
4541 SV* sv;
4542 if (prog->float_substr && !prog->float_utf8) {
4543 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4544 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4545 sv_utf8_upgrade(sv);
4546 if (SvTAIL(prog->float_substr))
4547 SvTAIL_on(sv);
4548 if (prog->float_substr == prog->check_substr)
4549 prog->check_utf8 = sv;
4550 }
4551 if (prog->anchored_substr && !prog->anchored_utf8) {
4552 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4553 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4554 sv_utf8_upgrade(sv);
4555 if (SvTAIL(prog->anchored_substr))
4556 SvTAIL_on(sv);
4557 if (prog->anchored_substr == prog->check_substr)
4558 prog->check_utf8 = sv;
4559 }
4560}
4561
4562STATIC void
4563S_to_byte_substr(pTHX_ register regexp *prog)
4564{
4565 SV* sv;
4566 if (prog->float_utf8 && !prog->float_substr) {
4567 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4568 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4569 if (sv_utf8_downgrade(sv, TRUE)) {
4570 if (SvTAIL(prog->float_utf8))
4571 SvTAIL_on(sv);
4572 } else {
4573 SvREFCNT_dec(sv);
4574 prog->float_substr = sv = &PL_sv_undef;
4575 }
4576 if (prog->float_utf8 == prog->check_utf8)
4577 prog->check_substr = sv;
4578 }
4579 if (prog->anchored_utf8 && !prog->anchored_substr) {
4580 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4581 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4582 if (sv_utf8_downgrade(sv, TRUE)) {
4583 if (SvTAIL(prog->anchored_utf8))
4584 SvTAIL_on(sv);
4585 } else {
4586 SvREFCNT_dec(sv);
4587 prog->anchored_substr = sv = &PL_sv_undef;
4588 }
4589 if (prog->anchored_utf8 == prog->check_utf8)
4590 prog->check_substr = sv;
4591 }
4592}