This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Remove bad advice : -M doesn't work on the #! line
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
be3c0a43 70 **** Copyright (c) 1991-2002, Larry Wall
a687059c 71 ****
9ef589d8
LW
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
a687059c
LW
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
864dbfa3 80#define PERL_IN_REGEXEC_C
a687059c 81#include "perl.h"
0f5d15d6 82
a687059c
LW
83#include "regcomp.h"
84
c277df42
IZ
85#define RF_tainted 1 /* tainted information used? */
86#define RF_warned 2 /* warned about big count? */
ce862d02 87#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
88#define RF_utf8 8 /* String contains multibyte chars? */
89
eb160463 90#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
91
92#define RS_init 1 /* eval environment created */
93#define RS_set 2 /* replsv value is set */
c277df42 94
a687059c
LW
95#ifndef STATIC
96#define STATIC static
97#endif
98
ba7b4546 99#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 100
c277df42
IZ
101/*
102 * Forwards.
103 */
104
33b8afdf 105#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 106#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 107
dfe13c55
GS
108#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
109#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
110#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
111#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
112#define HOPc(pos,off) ((char*)HOP(pos,off))
113#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 114
efb30f32 115#define HOPBACK(pos, off) ( \
e54858b0 116 (PL_reg_match_utf8) \
efb30f32
HS
117 ? reghopmaybe((U8*)pos, -off) \
118 : (pos - off >= PL_bostr) \
119 ? (U8*)(pos - off) \
120 : (U8*)NULL \
121)
122#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
123
1aa99e6b
IH
124#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
125#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
127#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
129#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
130
977e4b84 131#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 132
5f80c4cf 133/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
134#define JUMPABLE(rn) ( \
135 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
136 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
137 OP(rn) == PLUS || OP(rn) == MINMOD || \
138 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
139)
140
cca55fe3
JP
141#define HAS_TEXT(rn) ( \
142 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
143)
e2d8ce26 144
a84d97b6
HS
145/*
146 Search for mandatory following text node; for lookahead, the text must
147 follow but for lookbehind (rn->flags != 0) we skip to the next step.
148*/
cca55fe3 149#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 150 while (JUMPABLE(rn)) \
a84d97b6 151 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 152 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
153 else if (OP(rn) == PLUS) \
154 rn = NEXTOPER(rn); \
a84d97b6
HS
155 else if (OP(rn) == IFMATCH) \
156 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 157 else rn += NEXT_OFF(rn); \
5f80c4cf 158} STMT_END
74750237 159
acfe0abc 160static void restore_pos(pTHX_ void *arg);
51371543 161
76e3520e 162STATIC CHECKPOINT
cea2e8a9 163S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 164{
3280af22 165 int retval = PL_savestack_ix;
b1ce53c5
JH
166#define REGCP_PAREN_ELEMS 4
167 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
168 int p;
169
e49a9654
IH
170 if (paren_elems_to_push < 0)
171 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
172
a01268b5 173#define REGCP_OTHER_ELEMS 6
b1ce53c5 174 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 175 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 176/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
177 SSPUSHINT(PL_regendp[p]);
178 SSPUSHINT(PL_regstartp[p]);
3280af22 179 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
180 SSPUSHINT(p);
181 }
b1ce53c5 182/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
183 SSPUSHINT(PL_regsize);
184 SSPUSHINT(*PL_reglastparen);
a01268b5 185 SSPUSHINT(*PL_reglastcloseparen);
3280af22 186 SSPUSHPTR(PL_reginput);
41123dfd
JH
187#define REGCP_FRAME_ELEMS 2
188/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
189 * are needed for the regexp context stack bookkeeping. */
190 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 191 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 192
a0d0e21e
LW
193 return retval;
194}
195
c277df42 196/* These are needed since we do not localize EVAL nodes: */
02db2b7b 197# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 199 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 200
02db2b7b 201# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 202 PerlIO_printf(Perl_debug_log, \
faccc32b 203 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 204 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 205
76e3520e 206STATIC char *
cea2e8a9 207S_regcppop(pTHX)
a0d0e21e 208{
b1ce53c5 209 I32 i;
a0d0e21e
LW
210 U32 paren = 0;
211 char *input;
cf93c79d 212 I32 tmps;
b1ce53c5
JH
213
214 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 215 i = SSPOPINT;
b1ce53c5
JH
216 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
217 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 218 input = (char *) SSPOPPTR;
a01268b5 219 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
220 *PL_reglastparen = SSPOPINT;
221 PL_regsize = SSPOPINT;
b1ce53c5
JH
222
223 /* Now restore the parentheses context. */
41123dfd
JH
224 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
225 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 226 paren = (U32)SSPOPINT;
3280af22 227 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
228 PL_regstartp[paren] = SSPOPINT;
229 tmps = SSPOPINT;
3280af22
NIS
230 if (paren <= *PL_reglastparen)
231 PL_regendp[paren] = tmps;
c277df42 232 DEBUG_r(
c3464db5 233 PerlIO_printf(Perl_debug_log,
b900a521 234 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 235 (UV)paren, (IV)PL_regstartp[paren],
b900a521 236 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 237 (IV)PL_regendp[paren],
3280af22 238 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 239 );
a0d0e21e 240 }
c277df42 241 DEBUG_r(
eb160463 242 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 243 PerlIO_printf(Perl_debug_log,
faccc32b
JH
244 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
245 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
246 }
247 );
daf18116 248#if 1
dafc8851
JH
249 /* It would seem that the similar code in regtry()
250 * already takes care of this, and in fact it is in
251 * a better location to since this code can #if 0-ed out
252 * but the code in regtry() is needed or otherwise tests
253 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
254 * (as of patchlevel 7877) will fail. Then again,
255 * this code seems to be necessary or otherwise
256 * building DynaLoader will fail:
257 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
258 * --jhi */
eb160463
GS
259 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
260 if ((I32)paren > PL_regsize)
cf93c79d
IZ
261 PL_regstartp[paren] = -1;
262 PL_regendp[paren] = -1;
a0d0e21e 263 }
dafc8851 264#endif
a0d0e21e
LW
265 return input;
266}
267
0f5d15d6 268STATIC char *
cea2e8a9 269S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
270{
271 I32 tmp = PL_savestack_ix;
272
273 PL_savestack_ix = ss;
274 regcppop();
275 PL_savestack_ix = tmp;
942e002e 276 return Nullch;
0f5d15d6
IZ
277}
278
279typedef struct re_cc_state
280{
281 I32 ss;
282 regnode *node;
283 struct re_cc_state *prev;
284 CURCUR *cc;
285 regexp *re;
286} re_cc_state;
287
02db2b7b 288#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 289
29d1e993
HS
290#define TRYPAREN(paren, n, input) { \
291 if (paren) { \
292 if (n) { \
293 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
294 PL_regendp[paren] = input - PL_bostr; \
295 } \
296 else \
297 PL_regendp[paren] = -1; \
298 } \
299 if (regmatch(next)) \
300 sayYES; \
301 if (paren && n) \
302 PL_regendp[paren] = -1; \
303}
304
305
a687059c 306/*
e50aee73 307 * pregexec and friends
a687059c
LW
308 */
309
310/*
c277df42 311 - pregexec - match a regexp against a string
a687059c 312 */
c277df42 313I32
864dbfa3 314Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 315 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
316/* strend: pointer to null at end of string */
317/* strbeg: real beginning of string */
318/* minend: end of match must be >=minend after stringarg. */
319/* nosave: For optimizations. */
320{
321 return
9041c2e3 322 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
323 nosave ? 0 : REXEC_COPY_STR);
324}
0f5d15d6
IZ
325
326STATIC void
cea2e8a9 327S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
328{
329 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
330#ifdef DEBUGGING
331 PL_regprogram = prog->program;
332#endif
333 PL_regnpar = prog->nparens;
9041c2e3
NIS
334 PL_regdata = prog->data;
335 PL_reg_re = prog;
0f5d15d6 336}
22e551b9 337
9041c2e3 338/*
cad2e5aa
JH
339 * Need to implement the following flags for reg_anch:
340 *
341 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
342 * USE_INTUIT_ML
343 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
344 * INTUIT_AUTORITATIVE_ML
345 * INTUIT_ONCE_NOML - Intuit can match in one location only.
346 * INTUIT_ONCE_ML
347 *
348 * Another flag for this function: SECOND_TIME (so that float substrs
349 * with giant delta may be not rechecked).
350 */
351
352/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
353
2c2d71f5 354/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
355 Otherwise, only SvCUR(sv) is used to get strbeg. */
356
357/* XXXX We assume that strpos is strbeg unless sv. */
358
6eb5f6b9
JH
359/* XXXX Some places assume that there is a fixed substring.
360 An update may be needed if optimizer marks as "INTUITable"
361 RExen without fixed substrings. Similarly, it is assumed that
362 lengths of all the strings are no more than minlen, thus they
363 cannot come from lookahead.
364 (Or minlen should take into account lookahead.) */
365
2c2d71f5
JH
366/* A failure to find a constant substring means that there is no need to make
367 an expensive call to REx engine, thus we celebrate a failure. Similarly,
368 finding a substring too deep into the string means that less calls to
30944b6d
IZ
369 regtry() should be needed.
370
371 REx compiler's optimizer found 4 possible hints:
372 a) Anchored substring;
373 b) Fixed substring;
374 c) Whether we are anchored (beginning-of-line or \G);
375 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 376 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
377 string which does not contradict any of them.
378 */
2c2d71f5 379
6eb5f6b9
JH
380/* Most of decisions we do here should have been done at compile time.
381 The nodes of the REx which we used for the search should have been
382 deleted from the finite automaton. */
383
cad2e5aa
JH
384char *
385Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
386 char *strend, U32 flags, re_scream_pos_data *data)
387{
b7953727 388 register I32 start_shift = 0;
cad2e5aa 389 /* Should be nonnegative! */
b7953727 390 register I32 end_shift = 0;
2c2d71f5
JH
391 register char *s;
392 register SV *check;
a1933d95 393 char *strbeg;
cad2e5aa 394 char *t;
33b8afdf 395 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 396 I32 ml_anch;
6eb5f6b9 397 register char *other_last = Nullch; /* other substr checked before this */
b7953727 398 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
399#ifdef DEBUGGING
400 char *i_strpos = strpos;
ce333219 401 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 402#endif
cad2e5aa 403
b8d68ded
JH
404 if (prog->reganch & ROPT_UTF8) {
405 DEBUG_r(PerlIO_printf(Perl_debug_log,
406 "UTF-8 regex...\n"));
407 PL_reg_flags |= RF_utf8;
408 }
409
2a782b5b 410 DEBUG_r({
b8d68ded 411 char *s = PL_reg_match_utf8 ?
c728cb41
JH
412 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
413 strpos;
b8d68ded
JH
414 int len = PL_reg_match_utf8 ?
415 strlen(s) : strend - strpos;
2a782b5b
JH
416 if (!PL_colorset)
417 reginitcolors();
b8d68ded
JH
418 if (PL_reg_match_utf8)
419 DEBUG_r(PerlIO_printf(Perl_debug_log,
420 "UTF-8 target...\n"));
2a782b5b
JH
421 PerlIO_printf(Perl_debug_log,
422 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
423 PL_colors[4],PL_colors[5],PL_colors[0],
424 prog->precomp,
425 PL_colors[1],
426 (strlen(prog->precomp) > 60 ? "..." : ""),
427 PL_colors[0],
428 (int)(len > 60 ? 60 : len),
429 s, PL_colors[1],
430 (len > 60 ? "..." : "")
431 );
432 });
cad2e5aa 433
c344f387
JH
434 /* CHR_DIST() would be more correct here but it makes things slow. */
435 if (prog->minlen > strend - strpos) {
a72c7584
JH
436 DEBUG_r(PerlIO_printf(Perl_debug_log,
437 "String too short... [re_intuit_start]\n"));
cad2e5aa 438 goto fail;
2c2d71f5 439 }
a1933d95 440 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 441 PL_regeol = strend;
33b8afdf
JH
442 if (do_utf8) {
443 if (!prog->check_utf8 && prog->check_substr)
444 to_utf8_substr(prog);
445 check = prog->check_utf8;
446 } else {
447 if (!prog->check_substr && prog->check_utf8)
448 to_byte_substr(prog);
449 check = prog->check_substr;
450 }
451 if (check == &PL_sv_undef) {
452 DEBUG_r(PerlIO_printf(Perl_debug_log,
453 "Non-utf string cannot match utf check string\n"));
454 goto fail;
455 }
2c2d71f5 456 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
457 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
458 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 459 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 460
7e25d62c
JH
461 if (!ml_anch) {
462 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
463 | ROPT_IMPLICIT)) /* not a real BOL */
464 /* SvCUR is not set on references: SvRV and SvPVX overlap */
465 && sv && !SvROK(sv)
466 && (strpos != strbeg)) {
467 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
468 goto fail;
469 }
470 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 471 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 472 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
473 I32 slen;
474
1aa99e6b 475 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
476 if (SvTAIL(check)) {
477 slen = SvCUR(check); /* >= 1 */
cad2e5aa 478
9041c2e3 479 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
480 || (strend - s == slen && strend[-1] != '\n')) {
481 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
482 goto fail_finish;
cad2e5aa
JH
483 }
484 /* Now should match s[0..slen-2] */
485 slen--;
653099ff 486 if (slen && (*SvPVX(check) != *s
cad2e5aa 487 || (slen > 1
653099ff 488 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
489 report_neq:
490 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
491 goto fail_finish;
492 }
cad2e5aa 493 }
653099ff
GS
494 else if (*SvPVX(check) != *s
495 || ((slen = SvCUR(check)) > 1
496 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
497 goto report_neq;
498 goto success_at_start;
7e25d62c 499 }
cad2e5aa 500 }
2c2d71f5 501 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 502 s = strpos;
2c2d71f5 503 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 504 end_shift = prog->minlen - start_shift -
653099ff 505 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 506 if (!ml_anch) {
653099ff
GS
507 I32 end = prog->check_offset_max + CHR_SVLEN(check)
508 - (SvTAIL(check) != 0);
1aa99e6b 509 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
510
511 if (end_shift < eshift)
512 end_shift = eshift;
513 }
cad2e5aa 514 }
2c2d71f5 515 else { /* Can match at random position */
cad2e5aa
JH
516 ml_anch = 0;
517 s = strpos;
2c2d71f5
JH
518 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
519 /* Should be nonnegative! */
520 end_shift = prog->minlen - start_shift -
653099ff 521 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
522 }
523
2c2d71f5 524#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 525 if (end_shift < 0)
6bbae5e6 526 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
527#endif
528
2c2d71f5
JH
529 restart:
530 /* Find a possible match in the region s..strend by looking for
531 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 532 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
533 I32 p = -1; /* Internal iterator of scream. */
534 I32 *pp = data ? data->scream_pos : &p;
535
2c2d71f5
JH
536 if (PL_screamfirst[BmRARE(check)] >= 0
537 || ( BmRARE(check) == '\n'
538 && (BmPREVIOUS(check) == SvCUR(check) - 1)
539 && SvTAIL(check) ))
9041c2e3 540 s = screaminstr(sv, check,
2c2d71f5 541 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 542 else
2c2d71f5 543 goto fail_finish;
cad2e5aa
JH
544 if (data)
545 *data->scream_olds = s;
546 }
f33976b4 547 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
548 s = fbm_instr((U8*)(s + start_shift),
549 (U8*)(strend - end_shift),
550 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 551 else
1aa99e6b
IH
552 s = fbm_instr(HOP3(s, start_shift, strend),
553 HOP3(strend, -end_shift, strbeg),
2c2d71f5 554 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
555
556 /* Update the count-of-usability, remove useless subpatterns,
557 unshift s. */
2c2d71f5
JH
558
559 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
560 (s ? "Found" : "Did not find"),
33b8afdf 561 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 562 PL_colors[0],
7b0972df
JH
563 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
564 SvPVX(check),
2c2d71f5
JH
565 PL_colors[1], (SvTAIL(check) ? "$" : ""),
566 (s ? " at offset " : "...\n") ) );
567
568 if (!s)
569 goto fail_finish;
570
6eb5f6b9
JH
571 check_at = s;
572
2c2d71f5 573 /* Finish the diagnostic message */
30944b6d 574 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
575
576 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
577 Start with the other substr.
578 XXXX no SCREAM optimization yet - and a very coarse implementation
579 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
580 *always* match. Probably should be marked during compile...
581 Probably it is right to do no SCREAM here...
582 */
583
33b8afdf 584 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 585 /* Take into account the "other" substring. */
2c2d71f5
JH
586 /* XXXX May be hopelessly wrong for UTF... */
587 if (!other_last)
6eb5f6b9 588 other_last = strpos;
33b8afdf 589 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
590 do_other_anchored:
591 {
1aa99e6b 592 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 593 char *s1 = s;
33b8afdf 594 SV* must;
2c2d71f5 595
2c2d71f5
JH
596 t = s - prog->check_offset_max;
597 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
598 && (!(prog->reganch & ROPT_UTF8)
1aa99e6b 599 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 600 && t > strpos)))
30944b6d 601 /* EMPTY */;
2c2d71f5
JH
602 else
603 t = strpos;
1aa99e6b 604 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
605 if (t < other_last) /* These positions already checked */
606 t = other_last;
1aa99e6b 607 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
608 if (last < last1)
609 last1 = last;
610 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
611 /* On end-of-str: see comment below. */
33b8afdf
JH
612 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
613 if (must == &PL_sv_undef) {
614 s = (char*)NULL;
615 DEBUG_r(must = prog->anchored_utf8); /* for debug */
616 }
617 else
618 s = fbm_instr(
619 (unsigned char*)t,
620 HOP3(HOP3(last1, prog->anchored_offset, strend)
621 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
622 must,
623 PL_multiline ? FBMrf_MULTILINE : 0
624 );
1aa99e6b
IH
625 DEBUG_r(PerlIO_printf(Perl_debug_log,
626 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
627 (s ? "Found" : "Contradicts"),
628 PL_colors[0],
33b8afdf
JH
629 (int)(SvCUR(must)
630 - (SvTAIL(must)!=0)),
631 SvPVX(must),
632 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
633 if (!s) {
634 if (last1 >= last2) {
635 DEBUG_r(PerlIO_printf(Perl_debug_log,
636 ", giving up...\n"));
637 goto fail_finish;
638 }
639 DEBUG_r(PerlIO_printf(Perl_debug_log,
640 ", trying floating at offset %ld...\n",
1aa99e6b
IH
641 (long)(HOP3c(s1, 1, strend) - i_strpos)));
642 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
643 s = HOP3c(last, 1, strend);
2c2d71f5
JH
644 goto restart;
645 }
646 else {
647 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 648 (long)(s - i_strpos)));
1aa99e6b
IH
649 t = HOP3c(s, -prog->anchored_offset, strbeg);
650 other_last = HOP3c(s, 1, strend);
30944b6d 651 s = s1;
2c2d71f5
JH
652 if (t == strpos)
653 goto try_at_start;
2c2d71f5
JH
654 goto try_at_offset;
655 }
30944b6d 656 }
2c2d71f5
JH
657 }
658 else { /* Take into account the floating substring. */
33b8afdf
JH
659 char *last, *last1;
660 char *s1 = s;
661 SV* must;
662
663 t = HOP3c(s, -start_shift, strbeg);
664 last1 = last =
665 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
666 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
667 last = HOP3c(t, prog->float_max_offset, strend);
668 s = HOP3c(t, prog->float_min_offset, strend);
669 if (s < other_last)
670 s = other_last;
2c2d71f5 671 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
672 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
673 /* fbm_instr() takes into account exact value of end-of-str
674 if the check is SvTAIL(ed). Since false positives are OK,
675 and end-of-str is not later than strend we are OK. */
676 if (must == &PL_sv_undef) {
677 s = (char*)NULL;
678 DEBUG_r(must = prog->float_utf8); /* for debug message */
679 }
680 else
2c2d71f5 681 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
682 (unsigned char*)last + SvCUR(must)
683 - (SvTAIL(must)!=0),
684 must, PL_multiline ? FBMrf_MULTILINE : 0);
685 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
686 (s ? "Found" : "Contradicts"),
687 PL_colors[0],
688 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
689 SvPVX(must),
690 PL_colors[1], (SvTAIL(must) ? "$" : "")));
691 if (!s) {
692 if (last1 == last) {
2c2d71f5 693 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
694 ", giving up...\n"));
695 goto fail_finish;
2c2d71f5 696 }
33b8afdf
JH
697 DEBUG_r(PerlIO_printf(Perl_debug_log,
698 ", trying anchored starting at offset %ld...\n",
699 (long)(s1 + 1 - i_strpos)));
700 other_last = last;
701 s = HOP3c(t, 1, strend);
702 goto restart;
703 }
704 else {
705 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
706 (long)(s - i_strpos)));
707 other_last = s; /* Fix this later. --Hugo */
708 s = s1;
709 if (t == strpos)
710 goto try_at_start;
711 goto try_at_offset;
712 }
2c2d71f5 713 }
cad2e5aa 714 }
2c2d71f5
JH
715
716 t = s - prog->check_offset_max;
2c2d71f5
JH
717 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
718 && (!(prog->reganch & ROPT_UTF8)
1aa99e6b
IH
719 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
720 && t > strpos))) {
2c2d71f5
JH
721 /* Fixed substring is found far enough so that the match
722 cannot start at strpos. */
723 try_at_offset:
cad2e5aa 724 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
725 /* Eventually fbm_*() should handle this, but often
726 anchored_offset is not 0, so this check will not be wasted. */
727 /* XXXX In the code below we prefer to look for "^" even in
728 presence of anchored substrings. And we search even
729 beyond the found float position. These pessimizations
730 are historical artefacts only. */
731 find_anchor:
2c2d71f5 732 while (t < strend - prog->minlen) {
cad2e5aa 733 if (*t == '\n') {
4ee3650e 734 if (t < check_at - prog->check_offset_min) {
33b8afdf 735 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
736 /* Since we moved from the found position,
737 we definitely contradict the found anchored
30944b6d
IZ
738 substr. Due to the above check we do not
739 contradict "check" substr.
740 Thus we can arrive here only if check substr
741 is float. Redo checking for "other"=="fixed".
742 */
9041c2e3 743 strpos = t + 1;
30944b6d
IZ
744 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
745 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
746 goto do_other_anchored;
747 }
4ee3650e
GS
748 /* We don't contradict the found floating substring. */
749 /* XXXX Why not check for STCLASS? */
cad2e5aa 750 s = t + 1;
2c2d71f5 751 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 752 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
753 goto set_useful;
754 }
4ee3650e
GS
755 /* Position contradicts check-string */
756 /* XXXX probably better to look for check-string
757 than for "\n", so one should lower the limit for t? */
758 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 759 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 760 other_last = strpos = s = t + 1;
cad2e5aa
JH
761 goto restart;
762 }
763 t++;
764 }
2c2d71f5
JH
765 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
766 PL_colors[0],PL_colors[1]));
767 goto fail_finish;
cad2e5aa 768 }
f5952150
GS
769 else {
770 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
771 PL_colors[0],PL_colors[1]));
772 }
cad2e5aa
JH
773 s = t;
774 set_useful:
33b8afdf 775 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
776 }
777 else {
f5952150 778 /* The found string does not prohibit matching at strpos,
2c2d71f5 779 - no optimization of calling REx engine can be performed,
f5952150
GS
780 unless it was an MBOL and we are not after MBOL,
781 or a future STCLASS check will fail this. */
2c2d71f5
JH
782 try_at_start:
783 /* Even in this situation we may use MBOL flag if strpos is offset
784 wrt the start of the string. */
05b4157f 785 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 786 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
787 /* May be due to an implicit anchor of m{.*foo} */
788 && !(prog->reganch & ROPT_IMPLICIT))
789 {
cad2e5aa
JH
790 t = strpos;
791 goto find_anchor;
792 }
30944b6d 793 DEBUG_r( if (ml_anch)
f5952150
GS
794 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
795 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 796 );
2c2d71f5 797 success_at_start:
30944b6d 798 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
799 && (do_utf8 ? (
800 prog->check_utf8 /* Could be deleted already */
801 && --BmUSEFUL(prog->check_utf8) < 0
802 && (prog->check_utf8 == prog->float_utf8)
803 ) : (
804 prog->check_substr /* Could be deleted already */
805 && --BmUSEFUL(prog->check_substr) < 0
806 && (prog->check_substr == prog->float_substr)
807 )))
66e933ab 808 {
cad2e5aa 809 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 810 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
811 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
812 if (do_utf8 ? prog->check_substr : prog->check_utf8)
813 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
814 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
815 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 816 check = Nullsv; /* abort */
cad2e5aa 817 s = strpos;
3cf5c195
IZ
818 /* XXXX This is a remnant of the old implementation. It
819 looks wasteful, since now INTUIT can use many
6eb5f6b9 820 other heuristics. */
cad2e5aa
JH
821 prog->reganch &= ~RE_USE_INTUIT;
822 }
823 else
824 s = strpos;
825 }
826
6eb5f6b9
JH
827 /* Last resort... */
828 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
829 if (prog->regstclass) {
830 /* minlen == 0 is possible if regstclass is \b or \B,
831 and the fixed substr is ''$.
832 Since minlen is already taken into account, s+1 is before strend;
833 accidentally, minlen >= 1 guaranties no false positives at s + 1
834 even for \b or \B. But (minlen? 1 : 0) below assumes that
835 regstclass does not come from lookahead... */
836 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
837 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 838 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 839 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 840 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 841 : 1);
33b8afdf 842 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 843 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 844 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
845 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
846 cl_l, strend)
847 : strend);
a1933d95 848 char *startpos = strbeg;
6eb5f6b9
JH
849
850 t = s;
76384e4a 851 if (prog->reganch & ROPT_UTF8) {
ffc61ed2 852 PL_regdata = prog->data;
76384e4a
GS
853 PL_bostr = startpos;
854 }
f33976b4 855 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
856 if (!s) {
857#ifdef DEBUGGING
b7953727 858 char *what = 0;
6eb5f6b9
JH
859#endif
860 if (endpos == strend) {
861 DEBUG_r( PerlIO_printf(Perl_debug_log,
862 "Could not match STCLASS...\n") );
863 goto fail;
864 }
66e933ab
GS
865 DEBUG_r( PerlIO_printf(Perl_debug_log,
866 "This position contradicts STCLASS...\n") );
653099ff
GS
867 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
868 goto fail;
6eb5f6b9 869 /* Contradict one of substrings */
33b8afdf
JH
870 if (prog->anchored_substr || prog->anchored_utf8) {
871 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
872 DEBUG_r( what = "anchored" );
873 hop_and_restart:
1aa99e6b 874 s = HOP3c(t, 1, strend);
66e933ab
GS
875 if (s + start_shift + end_shift > strend) {
876 /* XXXX Should be taken into account earlier? */
877 DEBUG_r( PerlIO_printf(Perl_debug_log,
878 "Could not match STCLASS...\n") );
879 goto fail;
880 }
5e39e1e5
HS
881 if (!check)
882 goto giveup;
6eb5f6b9 883 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 884 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
885 what, (long)(s + start_shift - i_strpos)) );
886 goto restart;
887 }
66e933ab 888 /* Have both, check_string is floating */
6eb5f6b9
JH
889 if (t + start_shift >= check_at) /* Contradicts floating=check */
890 goto retry_floating_check;
891 /* Recheck anchored substring, but not floating... */
9041c2e3 892 s = check_at;
5e39e1e5
HS
893 if (!check)
894 goto giveup;
6eb5f6b9 895 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 896 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
897 (long)(other_last - i_strpos)) );
898 goto do_other_anchored;
899 }
60e71179
GS
900 /* Another way we could have checked stclass at the
901 current position only: */
902 if (ml_anch) {
903 s = t = t + 1;
5e39e1e5
HS
904 if (!check)
905 goto giveup;
60e71179 906 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
907 "Looking for /%s^%s/m starting at offset %ld...\n",
908 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 909 goto try_at_offset;
66e933ab 910 }
33b8afdf 911 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 912 goto fail;
6eb5f6b9
JH
913 /* Check is floating subtring. */
914 retry_floating_check:
915 t = check_at - start_shift;
916 DEBUG_r( what = "floating" );
917 goto hop_and_restart;
918 }
b7953727
JH
919 if (t != s) {
920 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 921 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
922 (long)(t - i_strpos), (long)(s - i_strpos))
923 );
924 }
925 else {
926 DEBUG_r(PerlIO_printf(Perl_debug_log,
927 "Does not contradict STCLASS...\n");
928 );
929 }
6eb5f6b9 930 }
5e39e1e5
HS
931 giveup:
932 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
933 PL_colors[4], (check ? "Guessed" : "Giving up"),
934 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 935 return s;
2c2d71f5
JH
936
937 fail_finish: /* Substring not found */
33b8afdf
JH
938 if (prog->check_substr || prog->check_utf8) /* could be removed already */
939 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 940 fail:
2c2d71f5 941 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
942 PL_colors[4],PL_colors[5]));
943 return Nullch;
944}
9661b544 945
6eb5f6b9 946/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
947STATIC char *
948S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 949{
6eb5f6b9
JH
950 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
951 char *m;
d8093b23
G
952 STRLEN ln;
953 unsigned int c1;
954 unsigned int c2;
6eb5f6b9
JH
955 char *e;
956 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 957 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 958
6eb5f6b9
JH
959 /* We know what class it must start with. */
960 switch (OP(c)) {
6eb5f6b9 961 case ANYOF:
a0ed51b3 962 while (s < strend) {
7d3e948e
JH
963 STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
964
965 if (do_utf8 ?
ba7b4546 966 reginclass(c, (U8*)s, 0, do_utf8) :
7d3e948e 967 REGINCLASS(c, (U8*)s) ||
ebc501f0
JH
968 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
969 /* The assignment of 2 is intentional:
970 * for the sharp s, the skip is 2. */
971 (skip = SHARP_S_SKIP)
972 )) {
6eb5f6b9
JH
973 if (tmp && (norun || regtry(prog, s)))
974 goto got_it;
975 else
976 tmp = doevery;
a0ed51b3 977 }
e0f9d4a8 978 else
7d3e948e 979 tmp = 1;
e0f9d4a8 980 s += skip;
a0d0e21e 981 }
6eb5f6b9 982 break;
f33976b4
DB
983 case CANY:
984 while (s < strend) {
985 if (tmp && (norun || regtry(prog, s)))
986 goto got_it;
987 else
988 tmp = doevery;
989 s++;
990 }
991 break;
6eb5f6b9
JH
992 case EXACTF:
993 m = STRING(c);
994 ln = STR_LEN(c);
1aa99e6b 995 if (UTF) {
a2a2844f 996 STRLEN ulen1, ulen2;
e7ae6809
JH
997 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
998 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
999
1000 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1001 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1002
872c91ae
JH
1003 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1004 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1005 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1006 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1007 }
1008 else {
1009 c1 = *(U8*)m;
1010 c2 = PL_fold[c1];
1011 }
6eb5f6b9
JH
1012 goto do_exactf;
1013 case EXACTFL:
1014 m = STRING(c);
1015 ln = STR_LEN(c);
d8093b23 1016 c1 = *(U8*)m;
6eb5f6b9
JH
1017 c2 = PL_fold_locale[c1];
1018 do_exactf:
6795a346 1019 e = HOP3c(strend, -(I32)ln, s);
b3c9acc1 1020
6eb5f6b9
JH
1021 if (norun && e < s)
1022 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1023
60a8b682
JH
1024 /* The idea in the EXACTF* cases is to first find the
1025 * first character of the EXACTF* node and then, if
1026 * necessary, case-insensitively compare the full
1027 * text of the node. The c1 and c2 are the first
1028 * characters (though in Unicode it gets a bit
1029 * more complicated because there are more cases
7f16dd3d
JH
1030 * than just upper and lower: one needs to use
1031 * the so-called folding case for case-insensitive
1032 * matching (called "loose matching" in Unicode).
1033 * ibcmp_utf8() will do just that. */
60a8b682 1034
1aa99e6b 1035 if (do_utf8) {
575cac57
JH
1036 UV c, f;
1037 U8 tmpbuf [UTF8_MAXLEN+1];
1038 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1039 STRLEN len, foldlen;
d7f013c8 1040
09091399 1041 if (c1 == c2) {
1aa99e6b 1042 while (s <= e) {
872c91ae
JH
1043 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1044 ckWARN(WARN_UTF8) ?
1045 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1046 if ( c == c1
1047 && (ln == len ||
66423254 1048 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1049 m, (char **)0, ln, (bool)UTF))
55da9344 1050 && (norun || regtry(prog, s)) )
1aa99e6b 1051 goto got_it;
80aecb99
JH
1052 else {
1053 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1054 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1055 if ( f != c
1056 && (f == c1 || f == c2)
1057 && (ln == foldlen ||
66423254
JH
1058 !ibcmp_utf8((char *) foldbuf,
1059 (char **)0, foldlen, do_utf8,
d07ddd77 1060 m,
eb160463 1061 (char **)0, ln, (bool)UTF))
80aecb99
JH
1062 && (norun || regtry(prog, s)) )
1063 goto got_it;
1064 }
1aa99e6b
IH
1065 s += len;
1066 }
09091399
JH
1067 }
1068 else {
1aa99e6b 1069 while (s <= e) {
872c91ae
JH
1070 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1071 ckWARN(WARN_UTF8) ?
1072 0 : UTF8_ALLOW_ANY);
80aecb99 1073
60a8b682 1074 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1075 * Note that not all the possible combinations
1076 * are handled here: some of them are handled
1077 * by the standard folding rules, and some of
1078 * them (the character class or ANYOF cases)
1079 * are handled during compiletime in
1080 * regexec.c:S_regclass(). */
880bd946
JH
1081 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1082 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1083 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1084
1085 if ( (c == c1 || c == c2)
1086 && (ln == len ||
66423254 1087 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1088 m, (char **)0, ln, (bool)UTF))
55da9344 1089 && (norun || regtry(prog, s)) )
1aa99e6b 1090 goto got_it;
80aecb99
JH
1091 else {
1092 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1093 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1094 if ( f != c
1095 && (f == c1 || f == c2)
1096 && (ln == foldlen ||
a6872d42 1097 !ibcmp_utf8((char *) foldbuf,
66423254 1098 (char **)0, foldlen, do_utf8,
d07ddd77 1099 m,
eb160463 1100 (char **)0, ln, (bool)UTF))
80aecb99
JH
1101 && (norun || regtry(prog, s)) )
1102 goto got_it;
1103 }
1aa99e6b
IH
1104 s += len;
1105 }
09091399 1106 }
1aa99e6b
IH
1107 }
1108 else {
1109 if (c1 == c2)
1110 while (s <= e) {
1111 if ( *(U8*)s == c1
1112 && (ln == 1 || !(OP(c) == EXACTF
1113 ? ibcmp(s, m, ln)
1114 : ibcmp_locale(s, m, ln)))
1115 && (norun || regtry(prog, s)) )
1116 goto got_it;
1117 s++;
1118 }
1119 else
1120 while (s <= e) {
1121 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1122 && (ln == 1 || !(OP(c) == EXACTF
1123 ? ibcmp(s, m, ln)
1124 : ibcmp_locale(s, m, ln)))
1125 && (norun || regtry(prog, s)) )
1126 goto got_it;
1127 s++;
1128 }
b3c9acc1
IZ
1129 }
1130 break;
bbce6d69 1131 case BOUNDL:
3280af22 1132 PL_reg_flags |= RF_tainted;
bbce6d69 1133 /* FALL THROUGH */
a0d0e21e 1134 case BOUND:
ffc61ed2 1135 if (do_utf8) {
12d33761 1136 if (s == PL_bostr)
ffc61ed2
JH
1137 tmp = '\n';
1138 else {
b4f7163a 1139 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1140
b4f7163a 1141 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1142 }
1143 tmp = ((OP(c) == BOUND ?
9041c2e3 1144 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1145 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1146 while (s < strend) {
1147 if (tmp == !(OP(c) == BOUND ?
3568d838 1148 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1149 isALNUM_LC_utf8((U8*)s)))
1150 {
1151 tmp = !tmp;
1152 if ((norun || regtry(prog, s)))
1153 goto got_it;
1154 }
1155 s += UTF8SKIP(s);
a687059c 1156 }
a0d0e21e 1157 }
667bb95a 1158 else {
12d33761 1159 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1160 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1161 while (s < strend) {
1162 if (tmp ==
1163 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1164 tmp = !tmp;
1165 if ((norun || regtry(prog, s)))
1166 goto got_it;
1167 }
1168 s++;
a0ed51b3 1169 }
a0ed51b3 1170 }
6eb5f6b9 1171 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1172 goto got_it;
1173 break;
bbce6d69 1174 case NBOUNDL:
3280af22 1175 PL_reg_flags |= RF_tainted;
bbce6d69 1176 /* FALL THROUGH */
a0d0e21e 1177 case NBOUND:
ffc61ed2 1178 if (do_utf8) {
12d33761 1179 if (s == PL_bostr)
ffc61ed2
JH
1180 tmp = '\n';
1181 else {
b4f7163a 1182 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1183
b4f7163a 1184 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1185 }
1186 tmp = ((OP(c) == NBOUND ?
9041c2e3 1187 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1188 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1189 while (s < strend) {
1190 if (tmp == !(OP(c) == NBOUND ?
3568d838 1191 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1192 isALNUM_LC_utf8((U8*)s)))
1193 tmp = !tmp;
1194 else if ((norun || regtry(prog, s)))
1195 goto got_it;
1196 s += UTF8SKIP(s);
1197 }
a0d0e21e 1198 }
667bb95a 1199 else {
12d33761 1200 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1201 tmp = ((OP(c) == NBOUND ?
1202 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1203 while (s < strend) {
1204 if (tmp ==
1205 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1206 tmp = !tmp;
1207 else if ((norun || regtry(prog, s)))
1208 goto got_it;
1209 s++;
1210 }
a0ed51b3 1211 }
6eb5f6b9 1212 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1213 goto got_it;
1214 break;
a0d0e21e 1215 case ALNUM:
ffc61ed2 1216 if (do_utf8) {
8269fa76 1217 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1218 while (s < strend) {
3568d838 1219 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1220 if (tmp && (norun || regtry(prog, s)))
1221 goto got_it;
1222 else
1223 tmp = doevery;
1224 }
bbce6d69 1225 else
ffc61ed2
JH
1226 tmp = 1;
1227 s += UTF8SKIP(s);
bbce6d69 1228 }
bbce6d69 1229 }
ffc61ed2
JH
1230 else {
1231 while (s < strend) {
1232 if (isALNUM(*s)) {
1233 if (tmp && (norun || regtry(prog, s)))
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
a0ed51b3 1238 else
ffc61ed2
JH
1239 tmp = 1;
1240 s++;
a0ed51b3 1241 }
a0ed51b3
LW
1242 }
1243 break;
bbce6d69 1244 case ALNUML:
3280af22 1245 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1246 if (do_utf8) {
1247 while (s < strend) {
1248 if (isALNUM_LC_utf8((U8*)s)) {
1249 if (tmp && (norun || regtry(prog, s)))
1250 goto got_it;
1251 else
1252 tmp = doevery;
1253 }
a687059c 1254 else
ffc61ed2
JH
1255 tmp = 1;
1256 s += UTF8SKIP(s);
a0d0e21e 1257 }
a0d0e21e 1258 }
ffc61ed2
JH
1259 else {
1260 while (s < strend) {
1261 if (isALNUM_LC(*s)) {
1262 if (tmp && (norun || regtry(prog, s)))
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a0ed51b3 1267 else
ffc61ed2
JH
1268 tmp = 1;
1269 s++;
a0ed51b3 1270 }
a0ed51b3
LW
1271 }
1272 break;
a0d0e21e 1273 case NALNUM:
ffc61ed2 1274 if (do_utf8) {
8269fa76 1275 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1276 while (s < strend) {
3568d838 1277 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1278 if (tmp && (norun || regtry(prog, s)))
1279 goto got_it;
1280 else
1281 tmp = doevery;
1282 }
bbce6d69 1283 else
ffc61ed2
JH
1284 tmp = 1;
1285 s += UTF8SKIP(s);
bbce6d69 1286 }
bbce6d69 1287 }
ffc61ed2
JH
1288 else {
1289 while (s < strend) {
1290 if (!isALNUM(*s)) {
1291 if (tmp && (norun || regtry(prog, s)))
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
a0ed51b3 1296 else
ffc61ed2
JH
1297 tmp = 1;
1298 s++;
a0ed51b3 1299 }
a0ed51b3
LW
1300 }
1301 break;
bbce6d69 1302 case NALNUML:
3280af22 1303 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1304 if (do_utf8) {
1305 while (s < strend) {
1306 if (!isALNUM_LC_utf8((U8*)s)) {
1307 if (tmp && (norun || regtry(prog, s)))
1308 goto got_it;
1309 else
1310 tmp = doevery;
1311 }
a687059c 1312 else
ffc61ed2
JH
1313 tmp = 1;
1314 s += UTF8SKIP(s);
a687059c 1315 }
a0d0e21e 1316 }
ffc61ed2
JH
1317 else {
1318 while (s < strend) {
1319 if (!isALNUM_LC(*s)) {
1320 if (tmp && (norun || regtry(prog, s)))
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a0ed51b3 1325 else
ffc61ed2
JH
1326 tmp = 1;
1327 s++;
a0ed51b3 1328 }
a0ed51b3
LW
1329 }
1330 break;
a0d0e21e 1331 case SPACE:
ffc61ed2 1332 if (do_utf8) {
8269fa76 1333 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1334 while (s < strend) {
3568d838 1335 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1336 if (tmp && (norun || regtry(prog, s)))
1337 goto got_it;
1338 else
1339 tmp = doevery;
1340 }
a0d0e21e 1341 else
ffc61ed2
JH
1342 tmp = 1;
1343 s += UTF8SKIP(s);
2304df62 1344 }
a0d0e21e 1345 }
ffc61ed2
JH
1346 else {
1347 while (s < strend) {
1348 if (isSPACE(*s)) {
1349 if (tmp && (norun || regtry(prog, s)))
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0ed51b3 1354 else
ffc61ed2
JH
1355 tmp = 1;
1356 s++;
a0ed51b3 1357 }
a0ed51b3
LW
1358 }
1359 break;
bbce6d69 1360 case SPACEL:
3280af22 1361 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1362 if (do_utf8) {
1363 while (s < strend) {
1364 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1365 if (tmp && (norun || regtry(prog, s)))
1366 goto got_it;
1367 else
1368 tmp = doevery;
1369 }
bbce6d69 1370 else
ffc61ed2
JH
1371 tmp = 1;
1372 s += UTF8SKIP(s);
bbce6d69 1373 }
bbce6d69 1374 }
ffc61ed2
JH
1375 else {
1376 while (s < strend) {
1377 if (isSPACE_LC(*s)) {
1378 if (tmp && (norun || regtry(prog, s)))
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
a0ed51b3 1383 else
ffc61ed2
JH
1384 tmp = 1;
1385 s++;
a0ed51b3 1386 }
a0ed51b3
LW
1387 }
1388 break;
a0d0e21e 1389 case NSPACE:
ffc61ed2 1390 if (do_utf8) {
8269fa76 1391 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1392 while (s < strend) {
3568d838 1393 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1394 if (tmp && (norun || regtry(prog, s)))
1395 goto got_it;
1396 else
1397 tmp = doevery;
1398 }
a0d0e21e 1399 else
ffc61ed2
JH
1400 tmp = 1;
1401 s += UTF8SKIP(s);
a687059c 1402 }
a0d0e21e 1403 }
ffc61ed2
JH
1404 else {
1405 while (s < strend) {
1406 if (!isSPACE(*s)) {
1407 if (tmp && (norun || regtry(prog, s)))
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0ed51b3 1412 else
ffc61ed2
JH
1413 tmp = 1;
1414 s++;
a0ed51b3 1415 }
a0ed51b3
LW
1416 }
1417 break;
bbce6d69 1418 case NSPACEL:
3280af22 1419 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1420 if (do_utf8) {
1421 while (s < strend) {
1422 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1423 if (tmp && (norun || regtry(prog, s)))
1424 goto got_it;
1425 else
1426 tmp = doevery;
1427 }
bbce6d69 1428 else
ffc61ed2
JH
1429 tmp = 1;
1430 s += UTF8SKIP(s);
bbce6d69 1431 }
bbce6d69 1432 }
ffc61ed2
JH
1433 else {
1434 while (s < strend) {
1435 if (!isSPACE_LC(*s)) {
1436 if (tmp && (norun || regtry(prog, s)))
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
a0ed51b3 1441 else
ffc61ed2
JH
1442 tmp = 1;
1443 s++;
a0ed51b3 1444 }
a0ed51b3
LW
1445 }
1446 break;
a0d0e21e 1447 case DIGIT:
ffc61ed2 1448 if (do_utf8) {
8269fa76 1449 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1450 while (s < strend) {
3568d838 1451 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1452 if (tmp && (norun || regtry(prog, s)))
1453 goto got_it;
1454 else
1455 tmp = doevery;
1456 }
a0d0e21e 1457 else
ffc61ed2
JH
1458 tmp = 1;
1459 s += UTF8SKIP(s);
2b69d0c2 1460 }
a0d0e21e 1461 }
ffc61ed2
JH
1462 else {
1463 while (s < strend) {
1464 if (isDIGIT(*s)) {
1465 if (tmp && (norun || regtry(prog, s)))
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0ed51b3 1470 else
ffc61ed2
JH
1471 tmp = 1;
1472 s++;
a0ed51b3 1473 }
a0ed51b3
LW
1474 }
1475 break;
b8c5462f
JH
1476 case DIGITL:
1477 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1478 if (do_utf8) {
1479 while (s < strend) {
1480 if (isDIGIT_LC_utf8((U8*)s)) {
1481 if (tmp && (norun || regtry(prog, s)))
1482 goto got_it;
1483 else
1484 tmp = doevery;
1485 }
b8c5462f 1486 else
ffc61ed2
JH
1487 tmp = 1;
1488 s += UTF8SKIP(s);
b8c5462f 1489 }
b8c5462f 1490 }
ffc61ed2
JH
1491 else {
1492 while (s < strend) {
1493 if (isDIGIT_LC(*s)) {
1494 if (tmp && (norun || regtry(prog, s)))
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2
JH
1500 tmp = 1;
1501 s++;
b8c5462f 1502 }
b8c5462f
JH
1503 }
1504 break;
a0d0e21e 1505 case NDIGIT:
ffc61ed2 1506 if (do_utf8) {
8269fa76 1507 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1508 while (s < strend) {
3568d838 1509 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1510 if (tmp && (norun || regtry(prog, s)))
1511 goto got_it;
1512 else
1513 tmp = doevery;
1514 }
a0d0e21e 1515 else
ffc61ed2
JH
1516 tmp = 1;
1517 s += UTF8SKIP(s);
a687059c 1518 }
a0d0e21e 1519 }
ffc61ed2
JH
1520 else {
1521 while (s < strend) {
1522 if (!isDIGIT(*s)) {
1523 if (tmp && (norun || regtry(prog, s)))
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0ed51b3 1528 else
ffc61ed2
JH
1529 tmp = 1;
1530 s++;
a0ed51b3 1531 }
a0ed51b3
LW
1532 }
1533 break;
b8c5462f
JH
1534 case NDIGITL:
1535 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1536 if (do_utf8) {
1537 while (s < strend) {
1538 if (!isDIGIT_LC_utf8((U8*)s)) {
1539 if (tmp && (norun || regtry(prog, s)))
1540 goto got_it;
1541 else
1542 tmp = doevery;
1543 }
b8c5462f 1544 else
ffc61ed2
JH
1545 tmp = 1;
1546 s += UTF8SKIP(s);
b8c5462f 1547 }
a0ed51b3 1548 }
ffc61ed2
JH
1549 else {
1550 while (s < strend) {
1551 if (!isDIGIT_LC(*s)) {
1552 if (tmp && (norun || regtry(prog, s)))
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
cf93c79d 1557 else
ffc61ed2
JH
1558 tmp = 1;
1559 s++;
b8c5462f 1560 }
b8c5462f
JH
1561 }
1562 break;
b3c9acc1 1563 default:
3c3eec57
GS
1564 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1565 break;
d6a28714 1566 }
6eb5f6b9
JH
1567 return 0;
1568 got_it:
1569 return s;
1570}
1571
1572/*
1573 - regexec_flags - match a regexp against a string
1574 */
1575I32
1576Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1577 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1578/* strend: pointer to null at end of string */
1579/* strbeg: real beginning of string */
1580/* minend: end of match must be >=minend after stringarg. */
1581/* data: May be used for some additional optimizations. */
1582/* nosave: For optimizations. */
1583{
6eb5f6b9
JH
1584 register char *s;
1585 register regnode *c;
1586 register char *startpos = stringarg;
6eb5f6b9
JH
1587 I32 minlen; /* must match at least this many chars */
1588 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1589 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1590 constant substr. */ /* CC */
1591 I32 end_shift = 0; /* Same for the end. */ /* CC */
1592 I32 scream_pos = -1; /* Internal iterator of scream. */
1593 char *scream_olds;
1594 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1595 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1596#ifdef DEBUGGING
9e55ce06
JH
1597 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1598 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1599#endif
6eb5f6b9
JH
1600
1601 PL_regcc = 0;
1602
1603 cache_re(prog);
1604#ifdef DEBUGGING
aea4f609 1605 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1606#endif
1607
1608 /* Be paranoid... */
1609 if (prog == NULL || startpos == NULL) {
1610 Perl_croak(aTHX_ "NULL regexp parameter");
1611 return 0;
1612 }
1613
1614 minlen = prog->minlen;
61a36c01 1615 if (strend - startpos < minlen) {
a72c7584
JH
1616 DEBUG_r(PerlIO_printf(Perl_debug_log,
1617 "String too short [regexec_flags]...\n"));
1618 goto phooey;
1aa99e6b 1619 }
6eb5f6b9 1620
6eb5f6b9
JH
1621 /* Check validity of program. */
1622 if (UCHARAT(prog->program) != REG_MAGIC) {
1623 Perl_croak(aTHX_ "corrupted regexp program");
1624 }
1625
1626 PL_reg_flags = 0;
1627 PL_reg_eval_set = 0;
1628 PL_reg_maxiter = 0;
1629
1630 if (prog->reganch & ROPT_UTF8)
1631 PL_reg_flags |= RF_utf8;
1632
1633 /* Mark beginning of line for ^ and lookbehind. */
1634 PL_regbol = startpos;
1635 PL_bostr = strbeg;
1636 PL_reg_sv = sv;
1637
1638 /* Mark end of line for $ (and such) */
1639 PL_regeol = strend;
1640
1641 /* see how far we have to get to not match where we matched before */
1642 PL_regtill = startpos+minend;
1643
1644 /* We start without call_cc context. */
1645 PL_reg_call_cc = 0;
1646
1647 /* If there is a "must appear" string, look for it. */
1648 s = startpos;
1649
1650 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1651 MAGIC *mg;
1652
1653 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1654 PL_reg_ganch = startpos;
1655 else if (sv && SvTYPE(sv) >= SVt_PVMG
1656 && SvMAGIC(sv)
14befaf4
DM
1657 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1658 && mg->mg_len >= 0) {
6eb5f6b9
JH
1659 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1660 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1661 if (s > PL_reg_ganch)
6eb5f6b9
JH
1662 goto phooey;
1663 s = PL_reg_ganch;
1664 }
1665 }
1666 else /* pos() not defined */
1667 PL_reg_ganch = strbeg;
1668 }
1669
33b8afdf 1670 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1671 re_scream_pos_data d;
1672
1673 d.scream_olds = &scream_olds;
1674 d.scream_pos = &scream_pos;
1675 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1676 if (!s) {
1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1678 goto phooey; /* not present */
3fa9c3d7 1679 }
6eb5f6b9
JH
1680 }
1681
2a782b5b 1682 DEBUG_r({
9e55ce06
JH
1683 char *s0 = UTF ?
1684 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1685 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1686 prog->precomp;
1687 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1688 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1689 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1690 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1691 if (!PL_colorset)
1692 reginitcolors();
1693 PerlIO_printf(Perl_debug_log,
9e55ce06 1694 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1695 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1696 len0, len0, s0,
2a782b5b 1697 PL_colors[1],
9e55ce06 1698 len0 > 60 ? "..." : "",
2a782b5b 1699 PL_colors[0],
9e55ce06
JH
1700 (int)(len1 > 60 ? 60 : len1),
1701 s1, PL_colors[1],
1702 (len1 > 60 ? "..." : "")
2a782b5b
JH
1703 );
1704 });
6eb5f6b9
JH
1705
1706 /* Simplest case: anchored match need be tried only once. */
1707 /* [unless only anchor is BOL and multiline is set] */
1708 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1709 if (s == startpos && regtry(prog, startpos))
1710 goto got_it;
1711 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1712 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1713 {
1714 char *end;
1715
1716 if (minlen)
1717 dontbother = minlen - 1;
1aa99e6b 1718 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1719 /* for multiline we only have to try after newlines */
33b8afdf 1720 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1721 if (s == startpos)
1722 goto after_try;
1723 while (1) {
1724 if (regtry(prog, s))
1725 goto got_it;
1726 after_try:
1727 if (s >= end)
1728 goto phooey;
1729 if (prog->reganch & RE_USE_INTUIT) {
1730 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1731 if (!s)
1732 goto phooey;
1733 }
1734 else
1735 s++;
1736 }
1737 } else {
1738 if (s > startpos)
1739 s--;
1740 while (s < end) {
1741 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1742 if (regtry(prog, s))
1743 goto got_it;
1744 }
1745 }
1746 }
1747 }
1748 goto phooey;
1749 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1750 if (regtry(prog, PL_reg_ganch))
1751 goto got_it;
1752 goto phooey;
1753 }
1754
1755 /* Messy cases: unanchored match. */
33b8afdf 1756 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1757 /* we have /x+whatever/ */
1758 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1759 char ch;
bf93d4cc
GS
1760#ifdef DEBUGGING
1761 int did_match = 0;
1762#endif
33b8afdf
JH
1763 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1764 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1765 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1766
1aa99e6b 1767 if (do_utf8) {
6eb5f6b9
JH
1768 while (s < strend) {
1769 if (*s == ch) {
bf93d4cc 1770 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1771 if (regtry(prog, s)) goto got_it;
1772 s += UTF8SKIP(s);
1773 while (s < strend && *s == ch)
1774 s += UTF8SKIP(s);
1775 }
1776 s += UTF8SKIP(s);
1777 }
1778 }
1779 else {
1780 while (s < strend) {
1781 if (*s == ch) {
bf93d4cc 1782 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1783 if (regtry(prog, s)) goto got_it;
1784 s++;
1785 while (s < strend && *s == ch)
1786 s++;
1787 }
1788 s++;
1789 }
1790 }
b7953727 1791 DEBUG_r(if (!did_match)
bf93d4cc 1792 PerlIO_printf(Perl_debug_log,
b7953727
JH
1793 "Did not find anchored character...\n")
1794 );
6eb5f6b9
JH
1795 }
1796 /*SUPPRESS 560*/
33b8afdf
JH
1797 else if (prog->anchored_substr != Nullsv
1798 || prog->anchored_utf8 != Nullsv
1799 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1800 && prog->float_max_offset < strend - s)) {
1801 SV *must;
1802 I32 back_max;
1803 I32 back_min;
1804 char *last;
6eb5f6b9 1805 char *last1; /* Last position checked before */
bf93d4cc
GS
1806#ifdef DEBUGGING
1807 int did_match = 0;
1808#endif
33b8afdf
JH
1809 if (prog->anchored_substr || prog->anchored_utf8) {
1810 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1811 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1812 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1813 back_max = back_min = prog->anchored_offset;
1814 } else {
1815 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1816 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1817 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1818 back_max = prog->float_max_offset;
1819 back_min = prog->float_min_offset;
1820 }
1821 if (must == &PL_sv_undef)
1822 /* could not downgrade utf8 check substring, so must fail */
1823 goto phooey;
1824
1825 last = HOP3c(strend, /* Cannot start after this */
1826 -(I32)(CHR_SVLEN(must)
1827 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1828
1829 if (s > PL_bostr)
1830 last1 = HOPc(s, -1);
1831 else
1832 last1 = s - 1; /* bogus */
1833
1834 /* XXXX check_substr already used to find `s', can optimize if
1835 check_substr==must. */
1836 scream_pos = -1;
1837 dontbother = end_shift;
1838 strend = HOPc(strend, -dontbother);
1839 while ( (s <= last) &&
9041c2e3 1840 ((flags & REXEC_SCREAM)
1aa99e6b 1841 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1842 end_shift, &scream_pos, 0))
1aa99e6b 1843 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1844 (unsigned char*)strend, must,
6eb5f6b9 1845 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
bf93d4cc 1846 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1847 if (HOPc(s, -back_max) > last1) {
1848 last1 = HOPc(s, -back_min);
1849 s = HOPc(s, -back_max);
1850 }
1851 else {
1852 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1853
1854 last1 = HOPc(s, -back_min);
1855 s = t;
1856 }
1aa99e6b 1857 if (do_utf8) {
6eb5f6b9
JH
1858 while (s <= last1) {
1859 if (regtry(prog, s))
1860 goto got_it;
1861 s += UTF8SKIP(s);
1862 }
1863 }
1864 else {
1865 while (s <= last1) {
1866 if (regtry(prog, s))
1867 goto got_it;
1868 s++;
1869 }
1870 }
1871 }
b7953727
JH
1872 DEBUG_r(if (!did_match)
1873 PerlIO_printf(Perl_debug_log,
1874 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1875 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1876 ? "anchored" : "floating"),
1877 PL_colors[0],
1878 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1879 SvPVX(must),
b7953727
JH
1880 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1881 );
6eb5f6b9
JH
1882 goto phooey;
1883 }
155aba94 1884 else if ((c = prog->regstclass)) {
66e933ab
GS
1885 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1886 /* don't bother with what can't match */
6eb5f6b9 1887 strend = HOPc(strend, -(minlen - 1));
ffc61ed2
JH
1888 DEBUG_r({
1889 SV *prop = sv_newmortal();
9e55ce06
JH
1890 char *s0;
1891 char *s1;
1892 int len0;
1893 int len1;
1894
ffc61ed2 1895 regprop(prop, c);
9e55ce06
JH
1896 s0 = UTF ?
1897 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1898 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1899 SvPVX(prop);
1900 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1901 s1 = UTF ?
c728cb41 1902 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1903 len1 = UTF ? SvCUR(dsv1) : strend - s;
1904 PerlIO_printf(Perl_debug_log,
1905 "Matching stclass `%*.*s' against `%*.*s'\n",
1906 len0, len0, s0,
1907 len1, len1, s1);
ffc61ed2 1908 });
6eb5f6b9
JH
1909 if (find_byclass(prog, c, s, strend, startpos, 0))
1910 goto got_it;
bf93d4cc 1911 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1912 }
1913 else {
1914 dontbother = 0;
33b8afdf
JH
1915 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1916 /* Trim the end. */
d6a28714 1917 char *last;
33b8afdf
JH
1918 SV* float_real;
1919
1920 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1921 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1922 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1923
1924 if (flags & REXEC_SCREAM) {
33b8afdf 1925 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1926 end_shift, &scream_pos, 1); /* last one */
1927 if (!last)
ffc61ed2 1928 last = scream_olds; /* Only one occurrence. */
b8c5462f 1929 }
d6a28714
JH
1930 else {
1931 STRLEN len;
33b8afdf 1932 char *little = SvPV(float_real, len);
d6a28714 1933
33b8afdf 1934 if (SvTAIL(float_real)) {
d6a28714
JH
1935 if (memEQ(strend - len + 1, little, len - 1))
1936 last = strend - len + 1;
1937 else if (!PL_multiline)
9041c2e3 1938 last = memEQ(strend - len, little, len)
d6a28714 1939 ? strend - len : Nullch;
b8c5462f 1940 else
d6a28714
JH
1941 goto find_last;
1942 } else {
1943 find_last:
9041c2e3 1944 if (len)
d6a28714 1945 last = rninstr(s, strend, little, little + len);
b8c5462f 1946 else
d6a28714 1947 last = strend; /* matching `$' */
b8c5462f 1948 }
b8c5462f 1949 }
bf93d4cc
GS
1950 if (last == NULL) {
1951 DEBUG_r(PerlIO_printf(Perl_debug_log,
1952 "%sCan't trim the tail, match fails (should not happen)%s\n",
1953 PL_colors[4],PL_colors[5]));
1954 goto phooey; /* Should not happen! */
1955 }
d6a28714
JH
1956 dontbother = strend - last + prog->float_min_offset;
1957 }
1958 if (minlen && (dontbother < minlen))
1959 dontbother = minlen - 1;
1960 strend -= dontbother; /* this one's always in bytes! */
1961 /* We don't know much -- general case. */
1aa99e6b 1962 if (do_utf8) {
d6a28714
JH
1963 for (;;) {
1964 if (regtry(prog, s))
1965 goto got_it;
1966 if (s >= strend)
1967 break;
b8c5462f 1968 s += UTF8SKIP(s);
d6a28714
JH
1969 };
1970 }
1971 else {
1972 do {
1973 if (regtry(prog, s))
1974 goto got_it;
1975 } while (s++ < strend);
1976 }
1977 }
1978
1979 /* Failure. */
1980 goto phooey;
1981
1982got_it:
1983 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1984
1985 if (PL_reg_eval_set) {
1986 /* Preserve the current value of $^R */
1987 if (oreplsv != GvSV(PL_replgv))
1988 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1989 restored, the value remains
1990 the same. */
acfe0abc 1991 restore_pos(aTHX_ 0);
d6a28714
JH
1992 }
1993
1994 /* make sure $`, $&, $', and $digit will work later */
1995 if ( !(flags & REXEC_NOT_FIRST) ) {
1996 if (RX_MATCH_COPIED(prog)) {
1997 Safefree(prog->subbeg);
1998 RX_MATCH_COPIED_off(prog);
1999 }
2000 if (flags & REXEC_COPY_STR) {
2001 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2002
2003 s = savepvn(strbeg, i);
2004 prog->subbeg = s;
2005 prog->sublen = i;
2006 RX_MATCH_COPIED_on(prog);
2007 }
2008 else {
2009 prog->subbeg = strbeg;
2010 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2011 }
2012 }
9041c2e3 2013
d6a28714
JH
2014 return 1;
2015
2016phooey:
bf93d4cc
GS
2017 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2018 PL_colors[4],PL_colors[5]));
d6a28714 2019 if (PL_reg_eval_set)
acfe0abc 2020 restore_pos(aTHX_ 0);
d6a28714
JH
2021 return 0;
2022}
2023
2024/*
2025 - regtry - try match at specific point
2026 */
2027STATIC I32 /* 0 failure, 1 success */
2028S_regtry(pTHX_ regexp *prog, char *startpos)
2029{
d6a28714
JH
2030 register I32 i;
2031 register I32 *sp;
2032 register I32 *ep;
2033 CHECKPOINT lastcp;
2034
02db2b7b
IZ
2035#ifdef DEBUGGING
2036 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2037#endif
d6a28714
JH
2038 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2039 MAGIC *mg;
2040
2041 PL_reg_eval_set = RS_init;
2042 DEBUG_r(DEBUG_s(
b900a521
JH
2043 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2044 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2045 ));
e8347627 2046 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2047 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2048 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2049 SAVETMPS;
2050 /* Apparently this is not needed, judging by wantarray. */
e8347627 2051 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2052 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2053
2054 if (PL_reg_sv) {
2055 /* Make $_ available to executed code. */
2056 if (PL_reg_sv != DEFSV) {
4d1ff10f 2057 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2058 SAVESPTR(DEFSV);
2059 DEFSV = PL_reg_sv;
b8c5462f 2060 }
d6a28714 2061
9041c2e3 2062 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2063 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2064 /* prepare for quick setting of pos */
14befaf4
DM
2065 sv_magic(PL_reg_sv, (SV*)0,
2066 PERL_MAGIC_regex_global, Nullch, 0);
2067 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2068 mg->mg_len = -1;
b8c5462f 2069 }
d6a28714
JH
2070 PL_reg_magic = mg;
2071 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2072 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2073 }
09687e5a 2074 if (!PL_reg_curpm) {
0f79a09d 2075 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2076#ifdef USE_ITHREADS
2077 {
2078 SV* repointer = newSViv(0);
577e12cc 2079 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2080 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2081 av_push(PL_regex_padav,repointer);
2082 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2083 PL_regex_pad = AvARRAY(PL_regex_padav);
2084 }
2085#endif
2086 }
aaa362c4 2087 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2088 PL_reg_oldcurpm = PL_curpm;
2089 PL_curpm = PL_reg_curpm;
2090 if (RX_MATCH_COPIED(prog)) {
2091 /* Here is a serious problem: we cannot rewrite subbeg,
2092 since it may be needed if this match fails. Thus
2093 $` inside (?{}) could fail... */
2094 PL_reg_oldsaved = prog->subbeg;
2095 PL_reg_oldsavedlen = prog->sublen;
2096 RX_MATCH_COPIED_off(prog);
2097 }
2098 else
2099 PL_reg_oldsaved = Nullch;
2100 prog->subbeg = PL_bostr;
2101 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2102 }
2103 prog->startp[0] = startpos - PL_bostr;
2104 PL_reginput = startpos;
2105 PL_regstartp = prog->startp;
2106 PL_regendp = prog->endp;
2107 PL_reglastparen = &prog->lastparen;
a01268b5 2108 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714
JH
2109 prog->lastparen = 0;
2110 PL_regsize = 0;
2111 DEBUG_r(PL_reg_starttry = startpos);
2112 if (PL_reg_start_tmpl <= prog->nparens) {
2113 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2114 if(PL_reg_start_tmp)
2115 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2116 else
2117 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2118 }
2119
2120 /* XXXX What this code is doing here?!!! There should be no need
2121 to do this again and again, PL_reglastparen should take care of
3dd2943c 2122 this! --ilya*/
dafc8851
JH
2123
2124 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2125 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2126 * PL_reglastparen), is not needed at all by the test suite
2127 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2128 * enough, for building DynaLoader, or otherwise this
2129 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2130 * will happen. Meanwhile, this code *is* needed for the
2131 * above-mentioned test suite tests to succeed. The common theme
2132 * on those tests seems to be returning null fields from matches.
2133 * --jhi */
dafc8851 2134#if 1
d6a28714
JH
2135 sp = prog->startp;
2136 ep = prog->endp;
2137 if (prog->nparens) {
eb160463 2138 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2139 *++sp = -1;
2140 *++ep = -1;
2141 }
2142 }
dafc8851 2143#endif
02db2b7b 2144 REGCP_SET(lastcp);
d6a28714
JH
2145 if (regmatch(prog->program + 1)) {
2146 prog->endp[0] = PL_reginput - PL_bostr;
2147 return 1;
2148 }
02db2b7b 2149 REGCP_UNWIND(lastcp);
d6a28714
JH
2150 return 0;
2151}
2152
02db2b7b
IZ
2153#define RE_UNWIND_BRANCH 1
2154#define RE_UNWIND_BRANCHJ 2
2155
2156union re_unwind_t;
2157
2158typedef struct { /* XX: makes sense to enlarge it... */
2159 I32 type;
2160 I32 prev;
2161 CHECKPOINT lastcp;
2162} re_unwind_generic_t;
2163
2164typedef struct {
2165 I32 type;
2166 I32 prev;
2167 CHECKPOINT lastcp;
2168 I32 lastparen;
2169 regnode *next;
2170 char *locinput;
2171 I32 nextchr;
2172#ifdef DEBUGGING
2173 int regindent;
2174#endif
2175} re_unwind_branch_t;
2176
2177typedef union re_unwind_t {
2178 I32 type;
2179 re_unwind_generic_t generic;
2180 re_unwind_branch_t branch;
2181} re_unwind_t;
2182
8ba1375e
MJD
2183#define sayYES goto yes
2184#define sayNO goto no
e0f9d4a8 2185#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2186#define sayYES_FINAL goto yes_final
2187#define sayYES_LOUD goto yes_loud
2188#define sayNO_FINAL goto no_final
2189#define sayNO_SILENT goto do_no
2190#define saySAME(x) if (x) goto yes; else goto no
2191
2192#define REPORT_CODE_OFF 24
2193
d6a28714
JH
2194/*
2195 - regmatch - main matching routine
2196 *
2197 * Conceptually the strategy is simple: check to see whether the current
2198 * node matches, call self recursively to see whether the rest matches,
2199 * and then act accordingly. In practice we make some effort to avoid
2200 * recursion, in particular by going through "ordinary" nodes (that don't
2201 * need to know whether the rest of the match failed) by a loop instead of
2202 * by recursion.
2203 */
2204/* [lwall] I've hoisted the register declarations to the outer block in order to
2205 * maybe save a little bit of pushing and popping on the stack. It also takes
2206 * advantage of machines that use a register save mask on subroutine entry.
2207 */
2208STATIC I32 /* 0 failure, 1 success */
2209S_regmatch(pTHX_ regnode *prog)
2210{
d6a28714
JH
2211 register regnode *scan; /* Current node. */
2212 regnode *next; /* Next node. */
2213 regnode *inner; /* Next node in internal branch. */
2214 register I32 nextchr; /* renamed nextchr - nextchar colides with
2215 function of same name */
2216 register I32 n; /* no or next */
b7953727
JH
2217 register I32 ln = 0; /* len or last */
2218 register char *s = Nullch; /* operand or save */
d6a28714 2219 register char *locinput = PL_reginput;
b7953727 2220 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2221 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2222 I32 unwind = 0;
b7953727 2223#if 0
02db2b7b 2224 I32 firstcp = PL_savestack_ix;
b7953727 2225#endif
53c4c00c 2226 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2227#ifdef DEBUGGING
ce333219
JH
2228 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2229 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2230 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2231#endif
02db2b7b 2232
d6a28714
JH
2233#ifdef DEBUGGING
2234 PL_regindent++;
2235#endif
2236
2237 /* Note that nextchr is a byte even in UTF */
2238 nextchr = UCHARAT(locinput);
2239 scan = prog;
2240 while (scan != NULL) {
8ba1375e 2241
2a782b5b 2242 DEBUG_r( {
d6a28714
JH
2243 SV *prop = sv_newmortal();
2244 int docolor = *PL_colors[0];
2245 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2246 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2247 /* The part of the string before starttry has one color
2248 (pref0_len chars), between starttry and current
2249 position another one (pref_len - pref0_len chars),
2250 after the current position the third one.
2251 We assume that pref0_len <= pref_len, otherwise we
2252 decrease pref0_len. */
9041c2e3 2253 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2254 ? (5 + taill) - l : locinput - PL_bostr;
2255 int pref0_len;
d6a28714 2256
df1ffd02 2257 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2258 pref_len++;
2259 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2260 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2261 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2262 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2263 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2264 l--;
d6a28714
JH
2265 if (pref0_len < 0)
2266 pref0_len = 0;
2267 if (pref0_len > pref_len)
2268 pref0_len = pref_len;
2269 regprop(prop, scan);
2a782b5b
JH
2270 {
2271 char *s0 =
df1ffd02 2272 do_utf8 ?
2a782b5b 2273 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2274 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2275 locinput - pref_len;
df1ffd02
JH
2276 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2277 char *s1 = do_utf8 ?
2a782b5b 2278 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2279 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2280 locinput - pref_len + pref0_len;
df1ffd02
JH
2281 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2282 char *s2 = do_utf8 ?
2a782b5b 2283 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2284 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2285 locinput;
df1ffd02 2286 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2287 PerlIO_printf(Perl_debug_log,
2288 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2289 (IV)(locinput - PL_bostr),
2290 PL_colors[4],
2291 len0, s0,
2292 PL_colors[5],
2293 PL_colors[2],
2294 len1, s1,
2295 PL_colors[3],
2296 (docolor ? "" : "> <"),
2297 PL_colors[0],
2298 len2, s2,
2299 PL_colors[1],
2300 15 - l - pref_len + 1,
2301 "",
2302 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2303 SvPVX(prop));
2304 }
2305 });
d6a28714
JH
2306
2307 next = scan + NEXT_OFF(scan);
2308 if (next == scan)
2309 next = NULL;
2310
2311 switch (OP(scan)) {
2312 case BOL:
12d33761
HS
2313 if (locinput == PL_bostr || (PL_multiline &&
2314 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2315 {
2316 /* regtill = regbol; */
b8c5462f
JH
2317 break;
2318 }
d6a28714
JH
2319 sayNO;
2320 case MBOL:
12d33761
HS
2321 if (locinput == PL_bostr ||
2322 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2323 {
b8c5462f
JH
2324 break;
2325 }
d6a28714
JH
2326 sayNO;
2327 case SBOL:
c2a73568 2328 if (locinput == PL_bostr)
b8c5462f 2329 break;
d6a28714
JH
2330 sayNO;
2331 case GPOS:
2332 if (locinput == PL_reg_ganch)
2333 break;
2334 sayNO;
2335 case EOL:
2336 if (PL_multiline)
2337 goto meol;
2338 else
2339 goto seol;
2340 case MEOL:
2341 meol:
2342 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2343 sayNO;
b8c5462f 2344 break;
d6a28714
JH
2345 case SEOL:
2346 seol:
2347 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2348 sayNO;
d6a28714 2349 if (PL_regeol - locinput > 1)
b8c5462f 2350 sayNO;
b8c5462f 2351 break;
d6a28714
JH
2352 case EOS:
2353 if (PL_regeol != locinput)
b8c5462f 2354 sayNO;
d6a28714 2355 break;
ffc61ed2 2356 case SANY:
d6a28714 2357 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2358 sayNO;
f33976b4
DB
2359 if (do_utf8) {
2360 locinput += PL_utf8skip[nextchr];
2361 if (locinput > PL_regeol)
2362 sayNO;
2363 nextchr = UCHARAT(locinput);
2364 }
2365 else
2366 nextchr = UCHARAT(++locinput);
2367 break;
2368 case CANY:
2369 if (!nextchr && locinput >= PL_regeol)
2370 sayNO;
b8c5462f 2371 nextchr = UCHARAT(++locinput);
a0d0e21e 2372 break;
ffc61ed2 2373 case REG_ANY:
1aa99e6b
IH
2374 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2375 sayNO;
2376 if (do_utf8) {
b8c5462f 2377 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2378 if (locinput > PL_regeol)
2379 sayNO;
a0ed51b3 2380 nextchr = UCHARAT(locinput);
a0ed51b3 2381 }
1aa99e6b
IH
2382 else
2383 nextchr = UCHARAT(++locinput);
a0ed51b3 2384 break;
d6a28714 2385 case EXACT:
cd439c50
IZ
2386 s = STRING(scan);
2387 ln = STR_LEN(scan);
eb160463 2388 if (do_utf8 != UTF) {
bc517b45 2389 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2390 char *l = locinput;
2391 char *e = s + ln;
bc517b45 2392 STRLEN ulen;
a72c7584 2393
5ff6fc6d
JH
2394 if (do_utf8) {
2395 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2396 while (s < e) {
2397 if (l >= PL_regeol)
5ff6fc6d
JH
2398 sayNO;
2399 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2400 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2401 ckWARN(WARN_UTF8) ?
2402 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2403 sayNO;
bc517b45 2404 l += ulen;
5ff6fc6d 2405 s ++;
1aa99e6b 2406 }
5ff6fc6d
JH
2407 }
2408 else {
2409 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2410 while (s < e) {
2411 if (l >= PL_regeol)
2412 sayNO;
5ff6fc6d 2413 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2414 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2415 ckWARN(WARN_UTF8) ?
2416 0 : UTF8_ALLOW_ANY))
1aa99e6b 2417 sayNO;
bc517b45 2418 s += ulen;
a72c7584 2419 l ++;
1aa99e6b 2420 }
5ff6fc6d 2421 }
1aa99e6b
IH
2422 locinput = l;
2423 nextchr = UCHARAT(locinput);
2424 break;
2425 }
bc517b45 2426 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2427 /* Inline the first character, for speed. */
2428 if (UCHARAT(s) != nextchr)
2429 sayNO;
2430 if (PL_regeol - locinput < ln)
2431 sayNO;
2432 if (ln > 1 && memNE(s, locinput, ln))
2433 sayNO;
2434 locinput += ln;
2435 nextchr = UCHARAT(locinput);
2436 break;
2437 case EXACTFL:
b8c5462f
JH
2438 PL_reg_flags |= RF_tainted;
2439 /* FALL THROUGH */
d6a28714 2440 case EXACTF:
cd439c50
IZ
2441 s = STRING(scan);
2442 ln = STR_LEN(scan);
d6a28714 2443
d07ddd77
JH
2444 if (do_utf8 || UTF) {
2445 /* Either target or the pattern are utf8. */
d6a28714 2446 char *l = locinput;
d07ddd77 2447 char *e = PL_regeol;
bc517b45 2448
eb160463 2449 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2450 l, &e, 0, do_utf8)) {
5486206c
JH
2451 /* One more case for the sharp s:
2452 * pack("U0U*", 0xDF) =~ /ss/i,
2453 * the 0xC3 0x9F are the UTF-8
2454 * byte sequence for the U+00DF. */
2455 if (!(do_utf8 &&
2456 toLOWER(s[0]) == 's' &&
2457 ln >= 2 &&
2458 toLOWER(s[1]) == 's' &&
2459 (U8)l[0] == 0xC3 &&
2460 e - l >= 2 &&
2461 (U8)l[1] == 0x9F))
2462 sayNO;
2463 }
d07ddd77
JH
2464 locinput = e;
2465 nextchr = UCHARAT(locinput);
2466 break;
a0ed51b3 2467 }
d6a28714 2468
bc517b45
JH
2469 /* Neither the target and the pattern are utf8. */
2470
d6a28714
JH
2471 /* Inline the first character, for speed. */
2472 if (UCHARAT(s) != nextchr &&
2473 UCHARAT(s) != ((OP(scan) == EXACTF)
2474 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2475 sayNO;
d6a28714 2476 if (PL_regeol - locinput < ln)
b8c5462f 2477 sayNO;
d6a28714
JH
2478 if (ln > 1 && (OP(scan) == EXACTF
2479 ? ibcmp(s, locinput, ln)
2480 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2481 sayNO;
d6a28714
JH
2482 locinput += ln;
2483 nextchr = UCHARAT(locinput);
a0d0e21e 2484 break;
d6a28714 2485 case ANYOF:
ffc61ed2 2486 if (do_utf8) {
9e55ce06
JH
2487 STRLEN inclasslen = PL_regeol - locinput;
2488
ba7b4546 2489 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2490 sayNO_ANYOF;
ffc61ed2
JH
2491 if (locinput >= PL_regeol)
2492 sayNO;
0f0076b4 2493 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2494 nextchr = UCHARAT(locinput);
e0f9d4a8 2495 break;
ffc61ed2
JH
2496 }
2497 else {
2498 if (nextchr < 0)
2499 nextchr = UCHARAT(locinput);
7d3e948e 2500 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2501 sayNO_ANYOF;
ffc61ed2
JH
2502 if (!nextchr && locinput >= PL_regeol)
2503 sayNO;
2504 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2505 break;
2506 }
2507 no_anyof:
2508 /* If we might have the case of the German sharp s
2509 * in a casefolding Unicode character class. */
2510
ebc501f0
JH
2511 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2512 locinput += SHARP_S_SKIP;
e0f9d4a8 2513 nextchr = UCHARAT(locinput);
ffc61ed2 2514 }
e0f9d4a8
JH
2515 else
2516 sayNO;
b8c5462f 2517 break;
d6a28714 2518 case ALNUML:
b8c5462f
JH
2519 PL_reg_flags |= RF_tainted;
2520 /* FALL THROUGH */
d6a28714 2521 case ALNUM:
b8c5462f 2522 if (!nextchr)
4633a7c4 2523 sayNO;
ffc61ed2 2524 if (do_utf8) {
ad24be35 2525 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2526 if (!(OP(scan) == ALNUM
3568d838 2527 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2528 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2529 {
2530 sayNO;
a0ed51b3 2531 }
b8c5462f 2532 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2533 nextchr = UCHARAT(locinput);
2534 break;
2535 }
ffc61ed2 2536 if (!(OP(scan) == ALNUM
d6a28714 2537 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2538 sayNO;
b8c5462f 2539 nextchr = UCHARAT(++locinput);
a0d0e21e 2540 break;
d6a28714 2541 case NALNUML:
b8c5462f
JH
2542 PL_reg_flags |= RF_tainted;
2543 /* FALL THROUGH */
d6a28714
JH
2544 case NALNUM:
2545 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2546 sayNO;
ffc61ed2 2547 if (do_utf8) {
8269fa76 2548 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2549 if (OP(scan) == NALNUM
3568d838 2550 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2551 : isALNUM_LC_utf8((U8*)locinput))
2552 {
b8c5462f 2553 sayNO;
d6a28714 2554 }
b8c5462f
JH
2555 locinput += PL_utf8skip[nextchr];
2556 nextchr = UCHARAT(locinput);
2557 break;
2558 }
ffc61ed2 2559 if (OP(scan) == NALNUM
d6a28714 2560 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2561 sayNO;
76e3520e 2562 nextchr = UCHARAT(++locinput);
a0d0e21e 2563 break;
d6a28714
JH
2564 case BOUNDL:
2565 case NBOUNDL:
3280af22 2566 PL_reg_flags |= RF_tainted;
bbce6d69 2567 /* FALL THROUGH */
d6a28714
JH
2568 case BOUND:
2569 case NBOUND:
2570 /* was last char in word? */
ffc61ed2 2571 if (do_utf8) {
12d33761
HS
2572 if (locinput == PL_bostr)
2573 ln = '\n';
ffc61ed2 2574 else {
b4f7163a 2575 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2576
b4f7163a 2577 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2578 }
2579 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2580 ln = isALNUM_uni(ln);
8269fa76 2581 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2582 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2583 }
2584 else {
9041c2e3 2585 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2586 n = isALNUM_LC_utf8((U8*)locinput);
2587 }
a0ed51b3 2588 }
d6a28714 2589 else {
12d33761
HS
2590 ln = (locinput != PL_bostr) ?
2591 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2592 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2593 ln = isALNUM(ln);
2594 n = isALNUM(nextchr);
2595 }
2596 else {
2597 ln = isALNUM_LC(ln);
2598 n = isALNUM_LC(nextchr);
2599 }
d6a28714 2600 }
ffc61ed2
JH
2601 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2602 OP(scan) == BOUNDL))
2603 sayNO;
a0ed51b3 2604 break;
d6a28714 2605 case SPACEL:
3280af22 2606 PL_reg_flags |= RF_tainted;
bbce6d69 2607 /* FALL THROUGH */
d6a28714 2608 case SPACE:
9442cb0e 2609 if (!nextchr)
4633a7c4 2610 sayNO;
1aa99e6b 2611 if (do_utf8) {
fd400ab9 2612 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2613 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2614 if (!(OP(scan) == SPACE
3568d838 2615 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2616 : isSPACE_LC_utf8((U8*)locinput)))
2617 {
2618 sayNO;
2619 }
2620 locinput += PL_utf8skip[nextchr];
2621 nextchr = UCHARAT(locinput);
2622 break;
d6a28714 2623 }
ffc61ed2
JH
2624 if (!(OP(scan) == SPACE
2625 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2626 sayNO;
2627 nextchr = UCHARAT(++locinput);
2628 }
2629 else {
2630 if (!(OP(scan) == SPACE
2631 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2632 sayNO;
2633 nextchr = UCHARAT(++locinput);
a0ed51b3 2634 }
a0ed51b3 2635 break;
d6a28714 2636 case NSPACEL:
3280af22 2637 PL_reg_flags |= RF_tainted;
bbce6d69 2638 /* FALL THROUGH */
d6a28714 2639 case NSPACE:
9442cb0e 2640 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2641 sayNO;
1aa99e6b 2642 if (do_utf8) {
8269fa76 2643 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2644 if (OP(scan) == NSPACE
3568d838 2645 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2646 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2647 {
2648 sayNO;
2649 }
2650 locinput += PL_utf8skip[nextchr];
2651 nextchr = UCHARAT(locinput);
2652 break;
a0ed51b3 2653 }
ffc61ed2 2654 if (OP(scan) == NSPACE
d6a28714 2655 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2656 sayNO;
76e3520e 2657 nextchr = UCHARAT(++locinput);
a0d0e21e 2658 break;
d6a28714 2659 case DIGITL:
a0ed51b3
LW
2660 PL_reg_flags |= RF_tainted;
2661 /* FALL THROUGH */
d6a28714 2662 case DIGIT:
9442cb0e 2663 if (!nextchr)
a0ed51b3 2664 sayNO;
1aa99e6b 2665 if (do_utf8) {
8269fa76 2666 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2667 if (!(OP(scan) == DIGIT
3568d838 2668 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2669 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2670 {
a0ed51b3 2671 sayNO;
dfe13c55 2672 }
6f06b55f 2673 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2674 nextchr = UCHARAT(locinput);
2675 break;
2676 }
ffc61ed2 2677 if (!(OP(scan) == DIGIT
9442cb0e 2678 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2679 sayNO;
2680 nextchr = UCHARAT(++locinput);
2681 break;
d6a28714 2682 case NDIGITL:
b8c5462f
JH
2683 PL_reg_flags |= RF_tainted;
2684 /* FALL THROUGH */
d6a28714 2685 case NDIGIT:
9442cb0e 2686 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2687 sayNO;
1aa99e6b 2688 if (do_utf8) {
8269fa76 2689 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2690 if (OP(scan) == NDIGIT
3568d838 2691 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2692 : isDIGIT_LC_utf8((U8*)locinput))
2693 {
a0ed51b3 2694 sayNO;
9442cb0e 2695 }
6f06b55f 2696 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2697 nextchr = UCHARAT(locinput);
2698 break;
2699 }
ffc61ed2 2700 if (OP(scan) == NDIGIT
9442cb0e 2701 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2702 sayNO;
2703 nextchr = UCHARAT(++locinput);
2704 break;
2705 case CLUMP:
b7c83a7e 2706 if (locinput >= PL_regeol)
a0ed51b3 2707 sayNO;
b7c83a7e
JH
2708 if (do_utf8) {
2709 LOAD_UTF8_CHARCLASS(mark,"~");
2710 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2711 sayNO;
2712 locinput += PL_utf8skip[nextchr];
2713 while (locinput < PL_regeol &&
2714 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2715 locinput += UTF8SKIP(locinput);
2716 if (locinput > PL_regeol)
2717 sayNO;
eb08e2da
JH
2718 }
2719 else
2720 locinput++;
a0ed51b3
LW
2721 nextchr = UCHARAT(locinput);
2722 break;
c8756f30 2723 case REFFL:
3280af22 2724 PL_reg_flags |= RF_tainted;
c8756f30 2725 /* FALL THROUGH */
c277df42 2726 case REF:
c8756f30 2727 case REFF:
c277df42 2728 n = ARG(scan); /* which paren pair */
cf93c79d 2729 ln = PL_regstartp[n];
2c2d71f5 2730 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2731 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2732 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2733 if (ln == PL_regendp[n])
a0d0e21e 2734 break;
a0ed51b3 2735
cf93c79d 2736 s = PL_bostr + ln;
1aa99e6b 2737 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2738 char *l = locinput;
cf93c79d 2739 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2740 /*
2741 * Note that we can't do the "other character" lookup trick as
2742 * in the 8-bit case (no pun intended) because in Unicode we
2743 * have to map both upper and title case to lower case.
2744 */
2745 if (OP(scan) == REFF) {
a2a2844f 2746 STRLEN ulen1, ulen2;
e7ae6809
JH
2747 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2748 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2749 while (s < e) {
2750 if (l >= PL_regeol)
2751 sayNO;
a2a2844f
JH
2752 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2753 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2754 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2755 sayNO;
a2a2844f
JH
2756 s += ulen1;
2757 l += ulen2;
a0ed51b3
LW
2758 }
2759 }
2760 locinput = l;
2761 nextchr = UCHARAT(locinput);
2762 break;
2763 }
2764
a0d0e21e 2765 /* Inline the first character, for speed. */
76e3520e 2766 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2767 (OP(scan) == REF ||
2768 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2769 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2770 sayNO;
cf93c79d 2771 ln = PL_regendp[n] - ln;
3280af22 2772 if (locinput + ln > PL_regeol)
4633a7c4 2773 sayNO;
c8756f30
AK
2774 if (ln > 1 && (OP(scan) == REF
2775 ? memNE(s, locinput, ln)
2776 : (OP(scan) == REFF
2777 ? ibcmp(s, locinput, ln)
2778 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2779 sayNO;
a0d0e21e 2780 locinput += ln;
76e3520e 2781 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2782 break;
2783
2784 case NOTHING:
c277df42 2785 case TAIL:
a0d0e21e
LW
2786 break;
2787 case BACK:
2788 break;
c277df42
IZ
2789 case EVAL:
2790 {
2791 dSP;
533c011a 2792 OP_4tree *oop = PL_op;
3280af22
NIS
2793 COP *ocurcop = PL_curcop;
2794 SV **ocurpad = PL_curpad;
c277df42 2795 SV *ret;
9041c2e3 2796
c277df42 2797 n = ARG(scan);
533c011a 2798 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2799 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
dfad63ad 2800 PL_curpad = AvARRAY((AV*)PL_regdata->data[n + 2]);
cf93c79d 2801 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2802
8e5e9ebe
RGS
2803 {
2804 SV **before = SP;
2805 CALLRUNOPS(aTHX); /* Scalar context. */
2806 SPAGAIN;
2807 if (SP == before)
075aa684 2808 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2809 else {
2810 ret = POPs;
2811 PUTBACK;
2812 }
2813 }
2814
0f5d15d6
IZ
2815 PL_op = oop;
2816 PL_curpad = ocurpad;
2817 PL_curcop = ocurcop;
c277df42 2818 if (logical) {
0f5d15d6
IZ
2819 if (logical == 2) { /* Postponed subexpression. */
2820 regexp *re;
22c35a8c 2821 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2822 re_cc_state state;
0f5d15d6
IZ
2823 CHECKPOINT cp, lastcp;
2824
2825 if(SvROK(ret) || SvRMAGICAL(ret)) {
2826 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2827
2828 if(SvMAGICAL(sv))
14befaf4 2829 mg = mg_find(sv, PERL_MAGIC_qr);
0f5d15d6
IZ
2830 }
2831 if (mg) {
2832 re = (regexp *)mg->mg_obj;
df0003d4 2833 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2834 }
2835 else {
2836 STRLEN len;
2837 char *t = SvPV(ret, len);
2838 PMOP pm;
2839 char *oprecomp = PL_regprecomp;
2840 I32 osize = PL_regsize;
2841 I32 onpar = PL_regnpar;
2842
5fcd1c1b 2843 Zero(&pm, 1, PMOP);
cea2e8a9 2844 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2845 if (!(SvFLAGS(ret)
0f5d15d6 2846 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
14befaf4
DM
2847 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2848 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2849 PL_regprecomp = oprecomp;
2850 PL_regsize = osize;
2851 PL_regnpar = onpar;
2852 }
2853 DEBUG_r(
9041c2e3 2854 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2855 "Entering embedded `%s%.60s%s%s'\n",
2856 PL_colors[0],
2857 re->precomp,
2858 PL_colors[1],
2859 (strlen(re->precomp) > 60 ? "..." : ""))
2860 );
2861 state.node = next;
2862 state.prev = PL_reg_call_cc;
2863 state.cc = PL_regcc;
2864 state.re = PL_reg_re;
2865
2ab05381 2866 PL_regcc = 0;
9041c2e3 2867
0f5d15d6 2868 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2869 REGCP_SET(lastcp);
0f5d15d6
IZ
2870 cache_re(re);
2871 state.ss = PL_savestack_ix;
2872 *PL_reglastparen = 0;
a01268b5 2873 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2874 PL_reg_call_cc = &state;
2875 PL_reginput = locinput;
2c2d71f5
JH
2876
2877 /* XXXX This is too dramatic a measure... */
2878 PL_reg_maxiter = 0;
2879
0f5d15d6 2880 if (regmatch(re->program + 1)) {
2c914db6
IZ
2881 /* Even though we succeeded, we need to restore
2882 global variables, since we may be wrapped inside
2883 SUSPEND, thus the match may be not finished yet. */
2884
2885 /* XXXX Do this only if SUSPENDed? */
2886 PL_reg_call_cc = state.prev;
2887 PL_regcc = state.cc;
2888 PL_reg_re = state.re;
2889 cache_re(PL_reg_re);
2890
2891 /* XXXX This is too dramatic a measure... */
2892 PL_reg_maxiter = 0;
2893
2894 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2895 ReREFCNT_dec(re);
2896 regcpblow(cp);
2897 sayYES;
2898 }
0f5d15d6 2899 ReREFCNT_dec(re);
02db2b7b 2900 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2901 regcppop();
2902 PL_reg_call_cc = state.prev;
2903 PL_regcc = state.cc;
2904 PL_reg_re = state.re;
d3790889 2905 cache_re(PL_reg_re);
2c2d71f5
JH
2906
2907 /* XXXX This is too dramatic a measure... */
2908 PL_reg_maxiter = 0;
2909
8e514ae6 2910 logical = 0;
0f5d15d6
IZ
2911 sayNO;
2912 }
c277df42 2913 sw = SvTRUE(ret);
0f5d15d6 2914 logical = 0;
a0ed51b3
LW
2915 }
2916 else
3280af22 2917 sv_setsv(save_scalar(PL_replgv), ret);
c277df42
IZ
2918 break;
2919 }
a0d0e21e 2920 case OPEN:
c277df42 2921 n = ARG(scan); /* which paren pair */
3280af22
NIS
2922 PL_reg_start_tmp[n] = locinput;
2923 if (n > PL_regsize)
2924 PL_regsize = n;
a0d0e21e
LW
2925 break;
2926 case CLOSE:
c277df42 2927 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2928 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2929 PL_regendp[n] = locinput - PL_bostr;
eb160463 2930 if (n > (I32)*PL_reglastparen)
3280af22 2931 *PL_reglastparen = n;
a01268b5 2932 *PL_reglastcloseparen = n;
a0d0e21e 2933 break;
c277df42
IZ
2934 case GROUPP:
2935 n = ARG(scan); /* which paren pair */
eb160463 2936 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2937 break;
2938 case IFTHEN:
2c2d71f5 2939 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2940 if (sw)
2941 next = NEXTOPER(NEXTOPER(scan));
2942 else {
2943 next = scan + ARG(scan);
2944 if (OP(next) == IFTHEN) /* Fake one. */
2945 next = NEXTOPER(NEXTOPER(next));
2946 }
2947 break;
2948 case LOGICAL:
0f5d15d6 2949 logical = scan->flags;
c277df42 2950 break;
2ab05381
IZ
2951/*******************************************************************
2952 PL_regcc contains infoblock about the innermost (...)* loop, and
2953 a pointer to the next outer infoblock.
2954
2955 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2956
2957 1) After matching X, regnode for CURLYX is processed;
2958
9041c2e3 2959 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
2960 regmatch() recursively with the starting point at WHILEM node;
2961
2962 3) Each hit of WHILEM node tries to match A and Z (in the order
2963 depending on the current iteration, min/max of {min,max} and
2964 greediness). The information about where are nodes for "A"
2965 and "Z" is read from the infoblock, as is info on how many times "A"
2966 was already matched, and greediness.
2967
2968 4) After A matches, the same WHILEM node is hit again.
2969
2970 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2971 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2972 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2973 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2974 of the external loop.
2975
2976 Currently present infoblocks form a tree with a stem formed by PL_curcc
2977 and whatever it mentions via ->next, and additional attached trees
2978 corresponding to temporarily unset infoblocks as in "5" above.
2979
9041c2e3 2980 In the following picture infoblocks for outer loop of
2ab05381
IZ
2981 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2982 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2983 infoblocks are drawn below the "reset" infoblock.
2984
2985 In fact in the picture below we do not show failed matches for Z and T
2986 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2987 more obvious *why* one needs to *temporary* unset infoblocks.]
2988
2989 Matched REx position InfoBlocks Comment
2990 (Y(A)*?Z)*?T x
2991 Y(A)*?Z)*?T x <- O
2992 Y (A)*?Z)*?T x <- O
2993 Y A)*?Z)*?T x <- O <- I
2994 YA )*?Z)*?T x <- O <- I
2995 YA A)*?Z)*?T x <- O <- I
2996 YAA )*?Z)*?T x <- O <- I
2997 YAA Z)*?T x <- O # Temporary unset I
2998 I
2999
3000 YAAZ Y(A)*?Z)*?T x <- O
3001 I
3002
3003 YAAZY (A)*?Z)*?T x <- O
3004 I
3005
3006 YAAZY A)*?Z)*?T x <- O <- I
3007 I
3008
3009 YAAZYA )*?Z)*?T x <- O <- I
3010 I
3011
3012 YAAZYA Z)*?T x <- O # Temporary unset I
3013 I,I
3014
3015 YAAZYAZ )*?T x <- O
3016 I,I
3017
3018 YAAZYAZ T x # Temporary unset O
3019 O
3020 I,I
3021
3022 YAAZYAZT x
3023 O
3024 I,I
3025 *******************************************************************/
a0d0e21e
LW
3026 case CURLYX: {
3027 CURCUR cc;
3280af22 3028 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3029 /* No need to save/restore up to this paren */
3030 I32 parenfloor = scan->flags;
c277df42
IZ
3031
3032 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3033 next += ARG(next);
3280af22
NIS
3034 cc.oldcc = PL_regcc;
3035 PL_regcc = &cc;
cb434fcc
IZ
3036 /* XXXX Probably it is better to teach regpush to support
3037 parenfloor > PL_regsize... */
eb160463 3038 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3039 parenfloor = *PL_reglastparen; /* Pessimization... */
3040 cc.parenfloor = parenfloor;
a0d0e21e
LW
3041 cc.cur = -1;
3042 cc.min = ARG1(scan);
3043 cc.max = ARG2(scan);
c277df42 3044 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3045 cc.next = next;
3046 cc.minmod = minmod;
3047 cc.lastloc = 0;
3280af22 3048 PL_reginput = locinput;
a0d0e21e
LW
3049 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3050 regcpblow(cp);
3280af22 3051 PL_regcc = cc.oldcc;
4633a7c4 3052 saySAME(n);
a0d0e21e
LW
3053 }
3054 /* NOT REACHED */
3055 case WHILEM: {
3056 /*
3057 * This is really hard to understand, because after we match
3058 * what we're trying to match, we must make sure the rest of
2c2d71f5 3059 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3060 * to go back UP the parse tree by recursing ever deeper. And
3061 * if it fails, we have to reset our parent's current state
3062 * that we can try again after backing off.
3063 */
3064
c277df42 3065 CHECKPOINT cp, lastcp;
3280af22 3066 CURCUR* cc = PL_regcc;
c277df42
IZ
3067 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3068
4633a7c4 3069 n = cc->cur + 1; /* how many we know we matched */
3280af22 3070 PL_reginput = locinput;
a0d0e21e 3071
c277df42 3072 DEBUG_r(
9041c2e3 3073 PerlIO_printf(Perl_debug_log,
91f3b821 3074 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3075 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3076 (long)n, (long)cc->min,
2797576d 3077 (long)cc->max, PTR2UV(cc))
c277df42 3078 );
4633a7c4 3079
a0d0e21e
LW
3080 /* If degenerate scan matches "", assume scan done. */
3081
579cf2c3 3082 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3083 PL_regcc = cc->oldcc;
2ab05381
IZ
3084 if (PL_regcc)
3085 ln = PL_regcc->cur;
c277df42 3086 DEBUG_r(
c3464db5
DD
3087 PerlIO_printf(Perl_debug_log,
3088 "%*s empty match detected, try continuation...\n",
3280af22 3089 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3090 );
a0d0e21e 3091 if (regmatch(cc->next))
4633a7c4 3092 sayYES;
2ab05381
IZ
3093 if (PL_regcc)
3094 PL_regcc->cur = ln;
3280af22 3095 PL_regcc = cc;
4633a7c4 3096 sayNO;
a0d0e21e
LW
3097 }
3098
3099 /* First just match a string of min scans. */
3100
3101 if (n < cc->min) {
3102 cc->cur = n;
3103 cc->lastloc = locinput;
4633a7c4
LW
3104 if (regmatch(cc->scan))
3105 sayYES;
3106 cc->cur = n - 1;
c277df42 3107 cc->lastloc = lastloc;
4633a7c4 3108 sayNO;
a0d0e21e
LW
3109 }
3110
2c2d71f5
JH
3111 if (scan->flags) {
3112 /* Check whether we already were at this position.
3113 Postpone detection until we know the match is not
3114 *that* much linear. */
3115 if (!PL_reg_maxiter) {
3116 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3117 PL_reg_leftiter = PL_reg_maxiter;
3118 }
3119 if (PL_reg_leftiter-- == 0) {
3120 I32 size = (PL_reg_maxiter + 7)/8;
3121 if (PL_reg_poscache) {
eb160463 3122 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3123 Renew(PL_reg_poscache, size, char);
3124 PL_reg_poscache_size = size;
3125 }
3126 Zero(PL_reg_poscache, size, char);
3127 }
3128 else {
3129 PL_reg_poscache_size = size;
3130 Newz(29, PL_reg_poscache, size, char);
3131 }
3132 DEBUG_r(
3133 PerlIO_printf(Perl_debug_log,
3134 "%sDetected a super-linear match, switching on caching%s...\n",
3135 PL_colors[4], PL_colors[5])
3136 );
3137 }
3138 if (PL_reg_leftiter < 0) {
3139 I32 o = locinput - PL_bostr, b;
3140
3141 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3142 b = o % 8;
3143 o /= 8;
3144 if (PL_reg_poscache[o] & (1<<b)) {
3145 DEBUG_r(
3146 PerlIO_printf(Perl_debug_log,
3147 "%*s already tried at this position...\n",
3148 REPORT_CODE_OFF+PL_regindent*2, "")
3149 );
7821416a 3150 sayNO_SILENT;
2c2d71f5
JH
3151 }
3152 PL_reg_poscache[o] |= (1<<b);
3153 }
3154 }
3155
a0d0e21e
LW
3156 /* Prefer next over scan for minimal matching. */
3157
3158 if (cc->minmod) {
3280af22 3159 PL_regcc = cc->oldcc;
2ab05381
IZ
3160 if (PL_regcc)
3161 ln = PL_regcc->cur;
5f05dabc 3162 cp = regcppush(cc->parenfloor);
02db2b7b 3163 REGCP_SET(lastcp);
5f05dabc 3164 if (regmatch(cc->next)) {
c277df42 3165 regcpblow(cp);
4633a7c4 3166 sayYES; /* All done. */
5f05dabc 3167 }
02db2b7b 3168 REGCP_UNWIND(lastcp);
5f05dabc 3169 regcppop();
2ab05381
IZ
3170 if (PL_regcc)
3171 PL_regcc->cur = ln;
3280af22 3172 PL_regcc = cc;
a0d0e21e 3173
c277df42 3174 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3175 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3176 && !(PL_reg_flags & RF_warned)) {
3177 PL_reg_flags |= RF_warned;
9014280d 3178 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3179 "Complex regular subexpression recursion",
3180 REG_INFTY - 1);
c277df42 3181 }
4633a7c4 3182 sayNO;
c277df42 3183 }
a687059c 3184
c277df42 3185 DEBUG_r(
c3464db5
DD
3186 PerlIO_printf(Perl_debug_log,
3187 "%*s trying longer...\n",
3280af22 3188 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3189 );
a0d0e21e 3190 /* Try scanning more and see if it helps. */
3280af22 3191 PL_reginput = locinput;
a0d0e21e
LW
3192 cc->cur = n;
3193 cc->lastloc = locinput;
5f05dabc 3194 cp = regcppush(cc->parenfloor);
02db2b7b 3195 REGCP_SET(lastcp);
5f05dabc 3196 if (regmatch(cc->scan)) {
c277df42 3197 regcpblow(cp);
4633a7c4 3198 sayYES;
5f05dabc 3199 }
02db2b7b 3200 REGCP_UNWIND(lastcp);
5f05dabc 3201 regcppop();
4633a7c4 3202 cc->cur = n - 1;
c277df42 3203 cc->lastloc = lastloc;
4633a7c4 3204 sayNO;
a0d0e21e
LW
3205 }
3206
3207 /* Prefer scan over next for maximal matching. */
3208
3209 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3210 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3211 cc->cur = n;
3212 cc->lastloc = locinput;
02db2b7b 3213 REGCP_SET(lastcp);
5f05dabc 3214 if (regmatch(cc->scan)) {
c277df42 3215 regcpblow(cp);
4633a7c4 3216 sayYES;
5f05dabc 3217 }
02db2b7b 3218 REGCP_UNWIND(lastcp);
a0d0e21e 3219 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3220 PL_reginput = locinput;
c277df42 3221 DEBUG_r(
c3464db5
DD
3222 PerlIO_printf(Perl_debug_log,
3223 "%*s failed, try continuation...\n",
3280af22 3224 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3225 );
3226 }
9041c2e3 3227 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3228 && !(PL_reg_flags & RF_warned)) {
3280af22 3229 PL_reg_flags |= RF_warned;
9014280d 3230 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3231 "Complex regular subexpression recursion",
3232 REG_INFTY - 1);
a0d0e21e
LW
3233 }
3234
3235 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3236 PL_regcc = cc->oldcc;
2ab05381
IZ
3237 if (PL_regcc)
3238 ln = PL_regcc->cur;
a0d0e21e 3239 if (regmatch(cc->next))
4633a7c4 3240 sayYES;
2ab05381
IZ
3241 if (PL_regcc)
3242 PL_regcc->cur = ln;
3280af22 3243 PL_regcc = cc;
4633a7c4 3244 cc->cur = n - 1;
c277df42 3245 cc->lastloc = lastloc;
4633a7c4 3246 sayNO;
a0d0e21e
LW
3247 }
3248 /* NOT REACHED */
9041c2e3 3249 case BRANCHJ:
c277df42
IZ
3250 next = scan + ARG(scan);
3251 if (next == scan)
3252 next = NULL;
3253 inner = NEXTOPER(NEXTOPER(scan));
3254 goto do_branch;
9041c2e3 3255 case BRANCH:
c277df42
IZ
3256 inner = NEXTOPER(scan);
3257 do_branch:
3258 {
c277df42
IZ
3259 c1 = OP(scan);
3260 if (OP(next) != c1) /* No choice. */
3261 next = inner; /* Avoid recursion. */
a0d0e21e 3262 else {
02db2b7b
IZ
3263 I32 lastparen = *PL_reglastparen;
3264 I32 unwind1;
3265 re_unwind_branch_t *uw;
3266
3267 /* Put unwinding data on stack */
3268 unwind1 = SSNEWt(1,re_unwind_branch_t);
3269 uw = SSPTRt(unwind1,re_unwind_branch_t);
3270 uw->prev = unwind;
3271 unwind = unwind1;
3272 uw->type = ((c1 == BRANCH)
3273 ? RE_UNWIND_BRANCH
3274 : RE_UNWIND_BRANCHJ);
3275 uw->lastparen = lastparen;
3276 uw->next = next;
3277 uw->locinput = locinput;
3278 uw->nextchr = nextchr;
3279#ifdef DEBUGGING
3280 uw->regindent = ++PL_regindent;
3281#endif
c277df42 3282
02db2b7b
IZ
3283 REGCP_SET(uw->lastcp);
3284
3285 /* Now go into the first branch */
3286 next = inner;
a687059c 3287 }
a0d0e21e
LW
3288 }
3289 break;
3290 case MINMOD:
3291 minmod = 1;
3292 break;
c277df42
IZ
3293 case CURLYM:
3294 {
00db4c45 3295 I32 l = 0;
c277df42 3296 CHECKPOINT lastcp;
9041c2e3 3297
c277df42
IZ
3298 /* We suppose that the next guy does not need
3299 backtracking: in particular, it is of constant length,
3300 and has no parenths to influence future backrefs. */
3301 ln = ARG1(scan); /* min to match */
3302 n = ARG2(scan); /* max to match */
c277df42
IZ
3303 paren = scan->flags;
3304 if (paren) {
3280af22
NIS
3305 if (paren > PL_regsize)
3306 PL_regsize = paren;
eb160463 3307 if (paren > (I32)*PL_reglastparen)
3280af22 3308 *PL_reglastparen = paren;
c277df42 3309 }
dc45a647 3310 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3311 if (paren)
3312 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3313 PL_reginput = locinput;
c277df42
IZ
3314 if (minmod) {
3315 minmod = 0;
3316 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3317 sayNO;
f31a99c8
HS
3318 /* if we matched something zero-length we don't need to
3319 backtrack - capturing parens are already defined, so
3320 the caveat in the maximal case doesn't apply
3321
3322 XXXX if ln == 0, we can redo this check first time
3323 through the following loop
3324 */
3325 if (ln && l == 0)
3326 n = ln; /* don't backtrack */
3280af22 3327 locinput = PL_reginput;
cca55fe3 3328 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3329 regnode *text_node = next;
3330
cca55fe3 3331 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3332
cca55fe3 3333 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3334 else {
cca55fe3
JP
3335 if (PL_regkind[(U8)OP(text_node)] == REF) {
3336 I32 n, ln;
3337 n = ARG(text_node); /* which paren pair */
3338 ln = PL_regstartp[n];
3339 /* assume yes if we haven't seen CLOSEn */
3340 if (
eb160463 3341 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3342 ln == -1 ||
3343 ln == PL_regendp[n]
3344 ) {
3345 c1 = c2 = -1000;
3346 goto assume_ok_MM;
3347 }
3348 c1 = *(PL_bostr + ln);
3349 }
3350 else { c1 = (U8)*STRING(text_node); }
af5decee 3351 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3352 c2 = PL_fold[c1];
af5decee 3353 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3354 c2 = PL_fold_locale[c1];
3355 else
3356 c2 = c1;
3357 }
a0ed51b3
LW
3358 }
3359 else
c277df42 3360 c1 = c2 = -1000;
cca55fe3 3361 assume_ok_MM:
02db2b7b 3362 REGCP_SET(lastcp);
5f4b28b2 3363 /* This may be improved if l == 0. */
c277df42
IZ
3364 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3365 /* If it could work, try it. */
3366 if (c1 == -1000 ||
3280af22
NIS
3367 UCHARAT(PL_reginput) == c1 ||
3368 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3369 {
3370 if (paren) {
f31a99c8 3371 if (ln) {
cf93c79d
IZ
3372 PL_regstartp[paren] =
3373 HOPc(PL_reginput, -l) - PL_bostr;
3374 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3375 }
3376 else
cf93c79d 3377 PL_regendp[paren] = -1;
c277df42
IZ
3378 }
3379 if (regmatch(next))
3380 sayYES;
02db2b7b 3381 REGCP_UNWIND(lastcp);
c277df42
IZ
3382 }
3383 /* Couldn't or didn't -- move forward. */
3280af22 3384 PL_reginput = locinput;
c277df42
IZ
3385 if (regrepeat_hard(scan, 1, &l)) {
3386 ln++;
3280af22 3387 locinput = PL_reginput;
c277df42
IZ
3388 }
3389 else
3390 sayNO;
3391 }
a0ed51b3
LW
3392 }
3393 else {
c277df42 3394 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3395 /* if we matched something zero-length we don't need to
3396 backtrack, unless the minimum count is zero and we
3397 are capturing the result - in that case the capture
3398 being defined or not may affect later execution
3399 */
3400 if (n != 0 && l == 0 && !(paren && ln == 0))
3401 ln = n; /* don't backtrack */
3280af22 3402 locinput = PL_reginput;
c277df42 3403 DEBUG_r(
5c0ca799 3404 PerlIO_printf(Perl_debug_log,
faccc32b 3405 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3406 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3407 (IV) n, (IV)l)
c277df42
IZ
3408 );
3409 if (n >= ln) {
cca55fe3 3410 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3411 regnode *text_node = next;
3412
cca55fe3 3413 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3414
cca55fe3 3415 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3416 else {
cca55fe3
JP
3417 if (PL_regkind[(U8)OP(text_node)] == REF) {
3418 I32 n, ln;
3419 n = ARG(text_node); /* which paren pair */
3420 ln = PL_regstartp[n];
3421 /* assume yes if we haven't seen CLOSEn */
3422 if (
eb160463 3423 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3424 ln == -1 ||
3425 ln == PL_regendp[n]
3426 ) {
3427 c1 = c2 = -1000;
3428 goto assume_ok_REG;
3429 }
3430 c1 = *(PL_bostr + ln);
3431 }
3432 else { c1 = (U8)*STRING(text_node); }
3433
af5decee 3434 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3435 c2 = PL_fold[c1];
af5decee 3436 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3437 c2 = PL_fold_locale[c1];
3438 else
3439 c2 = c1;
3440 }
a0ed51b3
LW
3441 }
3442 else
c277df42
IZ
3443 c1 = c2 = -1000;
3444 }
cca55fe3 3445 assume_ok_REG:
02db2b7b 3446 REGCP_SET(lastcp);
c277df42
IZ
3447 while (n >= ln) {
3448 /* If it could work, try it. */
3449 if (c1 == -1000 ||
3280af22
NIS
3450 UCHARAT(PL_reginput) == c1 ||
3451 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3452 {
3453 DEBUG_r(
c3464db5 3454 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3455 "%*s trying tail with n=%"IVdf"...\n",
3456 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3457 );
3458 if (paren) {
3459 if (n) {
cf93c79d
IZ
3460 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3461 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3462 }
a0ed51b3 3463 else
cf93c79d 3464 PL_regendp[paren] = -1;
c277df42 3465 }
a0ed51b3
LW
3466 if (regmatch(next))
3467 sayYES;
02db2b7b 3468 REGCP_UNWIND(lastcp);
a0ed51b3 3469 }
c277df42
IZ
3470 /* Couldn't or didn't -- back up. */
3471 n--;
dfe13c55 3472 locinput = HOPc(locinput, -l);
3280af22 3473 PL_reginput = locinput;
c277df42
IZ
3474 }
3475 }
3476 sayNO;
3477 break;
3478 }
3479 case CURLYN:
3480 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3481 if (paren > PL_regsize)
3482 PL_regsize = paren;
eb160463 3483 if (paren > (I32)*PL_reglastparen)
3280af22 3484 *PL_reglastparen = paren;
c277df42
IZ
3485 ln = ARG1(scan); /* min to match */
3486 n = ARG2(scan); /* max to match */
dc45a647 3487 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3488 goto repeat;
a0d0e21e 3489 case CURLY:
c277df42 3490 paren = 0;
a0d0e21e
LW
3491 ln = ARG1(scan); /* min to match */
3492 n = ARG2(scan); /* max to match */
dc45a647 3493 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3494 goto repeat;
3495 case STAR:
3496 ln = 0;
c277df42 3497 n = REG_INFTY;
a0d0e21e 3498 scan = NEXTOPER(scan);
c277df42 3499 paren = 0;
a0d0e21e
LW
3500 goto repeat;
3501 case PLUS:
c277df42
IZ
3502 ln = 1;
3503 n = REG_INFTY;
3504 scan = NEXTOPER(scan);
3505 paren = 0;
3506 repeat:
a0d0e21e
LW
3507 /*
3508 * Lookahead to avoid useless match attempts
3509 * when we know what character comes next.
3510 */
5f80c4cf
JP
3511
3512 /*
3513 * Used to only do .*x and .*?x, but now it allows
3514 * for )'s, ('s and (?{ ... })'s to be in the way
3515 * of the quantifier and the EXACT-like node. -- japhy
3516 */
3517
cca55fe3 3518 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3519 U8 *s;
3520 regnode *text_node = next;
3521
cca55fe3 3522 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3523
cca55fe3 3524 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3525 else {
cca55fe3
JP
3526 if (PL_regkind[(U8)OP(text_node)] == REF) {
3527 I32 n, ln;
3528 n = ARG(text_node); /* which paren pair */
3529 ln = PL_regstartp[n];
3530 /* assume yes if we haven't seen CLOSEn */
3531 if (
eb160463 3532 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3533 ln == -1 ||
3534 ln == PL_regendp[n]
3535 ) {
3536 c1 = c2 = -1000;
3537 goto assume_ok_easy;
3538 }
9246c65e 3539 s = (U8*)PL_bostr + ln;
cca55fe3
JP
3540 }
3541 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3542
3543 if (!UTF) {
3544 c2 = c1 = *s;
f65d3ee7 3545 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3546 c2 = PL_fold[c1];
f65d3ee7 3547 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3548 c2 = PL_fold_locale[c1];
1aa99e6b 3549 }
5f80c4cf 3550 else { /* UTF */
f65d3ee7 3551 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3552 STRLEN ulen1, ulen2;
e7ae6809
JH
3553 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3554 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3555
3556 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3557 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3558
872c91ae
JH
3559 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3560 ckWARN(WARN_UTF8) ?
3561 0 : UTF8_ALLOW_ANY);
3562 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3563 ckWARN(WARN_UTF8) ?
3564 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3565 }
3566 else {
872c91ae
JH
3567 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3568 ckWARN(WARN_UTF8) ?
3569 0 : UTF8_ALLOW_ANY);
5f80c4cf 3570 }
1aa99e6b
IH
3571 }
3572 }
bbce6d69 3573 }
a0d0e21e 3574 else
bbce6d69 3575 c1 = c2 = -1000;
cca55fe3 3576 assume_ok_easy:
3280af22 3577 PL_reginput = locinput;
a0d0e21e 3578 if (minmod) {
c277df42 3579 CHECKPOINT lastcp;
a0d0e21e
LW
3580 minmod = 0;
3581 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3582 sayNO;
a0ed51b3 3583 locinput = PL_reginput;
02db2b7b 3584 REGCP_SET(lastcp);
0fe9bf95 3585 if (c1 != -1000) {
1aa99e6b 3586 char *e; /* Should not check after this */
0fe9bf95 3587 char *old = locinput;
b2f2f093 3588 int count = 0;
0fe9bf95 3589
1aa99e6b 3590 if (n == REG_INFTY) {
0fe9bf95 3591 e = PL_regeol - 1;
1aa99e6b
IH
3592 if (do_utf8)
3593 while (UTF8_IS_CONTINUATION(*(U8*)e))
3594 e--;
3595 }
3596 else if (do_utf8) {
3597 int m = n - ln;
3598 for (e = locinput;
3599 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3600 e += UTF8SKIP(e);
3601 }
3602 else {
3603 e = locinput + n - ln;
3604 if (e >= PL_regeol)
3605 e = PL_regeol - 1;
3606 }
0fe9bf95
IZ
3607 while (1) {
3608 /* Find place 'next' could work */
1aa99e6b
IH
3609 if (!do_utf8) {
3610 if (c1 == c2) {
a8e8ab15
JH
3611 while (locinput <= e &&
3612 UCHARAT(locinput) != c1)
1aa99e6b
IH
3613 locinput++;
3614 } else {
9041c2e3 3615 while (locinput <= e
a8e8ab15
JH
3616 && UCHARAT(locinput) != c1
3617 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3618 locinput++;
3619 }
3620 count = locinput - old;
3621 }
3622 else {
3623 STRLEN len;
3624 if (c1 == c2) {
872c91ae
JH
3625 /* count initialised to
3626 * utf8_distance(old, locinput) */
b2f2f093 3627 while (locinput <= e &&
872c91ae
JH
3628 utf8n_to_uvchr((U8*)locinput,
3629 UTF8_MAXLEN, &len,
3630 ckWARN(WARN_UTF8) ?
eb160463 3631 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3632 locinput += len;
b2f2f093
JH
3633 count++;
3634 }
1aa99e6b 3635 } else {
872c91ae
JH
3636 /* count initialised to
3637 * utf8_distance(old, locinput) */
b2f2f093 3638 while (locinput <= e) {
872c91ae
JH
3639 UV c = utf8n_to_uvchr((U8*)locinput,
3640 UTF8_MAXLEN, &len,
3641 ckWARN(WARN_UTF8) ?
3642 0 : UTF8_ALLOW_ANY);
eb160463 3643 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3644 break;
b2f2f093
JH
3645 locinput += len;
3646 count++;
1aa99e6b
IH
3647 }
3648 }
0fe9bf95 3649 }
9041c2e3 3650 if (locinput > e)
0fe9bf95
IZ
3651 sayNO;
3652 /* PL_reginput == old now */
3653 if (locinput != old) {
3654 ln = 1; /* Did some */
1aa99e6b 3655 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3656 sayNO;
3657 }
3658 /* PL_reginput == locinput now */
29d1e993 3659 TRYPAREN(paren, ln, locinput);
0fe9bf95 3660 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3661 REGCP_UNWIND(lastcp);
0fe9bf95 3662 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3663 old = locinput;
3664 if (do_utf8)
3665 locinput += UTF8SKIP(locinput);
3666 else
3667 locinput++;
b2f2f093 3668 count = 1;
0fe9bf95
IZ
3669 }
3670 }
3671 else
c277df42 3672 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3673 UV c;
3674 if (c1 != -1000) {
3675 if (do_utf8)
872c91ae
JH
3676 c = utf8n_to_uvchr((U8*)PL_reginput,
3677 UTF8_MAXLEN, 0,
3678 ckWARN(WARN_UTF8) ?
3679 0 : UTF8_ALLOW_ANY);
1aa99e6b 3680 else
9041c2e3 3681 c = UCHARAT(PL_reginput);
2390ecbc 3682 /* If it could work, try it. */
eb160463 3683 if (c == (UV)c1 || c == (UV)c2)
2390ecbc
PP
3684 {
3685 TRYPAREN(paren, n, PL_reginput);
3686 REGCP_UNWIND(lastcp);
3687 }
1aa99e6b 3688 }
a0d0e21e 3689 /* If it could work, try it. */
2390ecbc 3690 else if (c1 == -1000)
bbce6d69 3691 {
29d1e993 3692 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3693 REGCP_UNWIND(lastcp);
bbce6d69 3694 }
c277df42 3695 /* Couldn't or didn't -- move forward. */
a0ed51b3 3696 PL_reginput = locinput;
a0d0e21e
LW
3697 if (regrepeat(scan, 1)) {
3698 ln++;
a0ed51b3
LW
3699 locinput = PL_reginput;
3700 }
3701 else
4633a7c4 3702 sayNO;
a0d0e21e
LW
3703 }
3704 }
3705 else {
c277df42 3706 CHECKPOINT lastcp;
a0d0e21e 3707 n = regrepeat(scan, n);
a0ed51b3 3708 locinput = PL_reginput;
22c35a8c 3709 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3710 ((!PL_multiline && OP(next) != MEOL) ||
3711 OP(next) == SEOL || OP(next) == EOS))
3712 {
a0d0e21e 3713 ln = n; /* why back off? */
1aeab75a
GS
3714 /* ...because $ and \Z can match before *and* after
3715 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3716 We should back off by one in this case. */
3717 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3718 ln--;
3719 }
02db2b7b 3720 REGCP_SET(lastcp);
c277df42 3721 if (paren) {
8fa7f367 3722 UV c = 0;
c277df42 3723 while (n >= ln) {
1aa99e6b
IH
3724 if (c1 != -1000) {
3725 if (do_utf8)
872c91ae
JH
3726 c = utf8n_to_uvchr((U8*)PL_reginput,
3727 UTF8_MAXLEN, 0,
3728 ckWARN(WARN_UTF8) ?
3729 0 : UTF8_ALLOW_ANY);
1aa99e6b 3730 else
9041c2e3 3731 c = UCHARAT(PL_reginput);
1aa99e6b 3732 }
c277df42 3733 /* If it could work, try it. */
eb160463 3734 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3735 {
29d1e993 3736 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3737 REGCP_UNWIND(lastcp);
c277df42
IZ
3738 }
3739 /* Couldn't or didn't -- back up. */
3740 n--;
dfe13c55 3741 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3742 }
a0ed51b3
LW
3743 }
3744 else {
8fa7f367 3745 UV c = 0;
c277df42 3746 while (n >= ln) {
1aa99e6b
IH
3747 if (c1 != -1000) {
3748 if (do_utf8)
872c91ae
JH
3749 c = utf8n_to_uvchr((U8*)PL_reginput,
3750 UTF8_MAXLEN, 0,
3751 ckWARN(WARN_UTF8) ?
3752 0 : UTF8_ALLOW_ANY);
1aa99e6b 3753 else
9041c2e3 3754 c = UCHARAT(PL_reginput);
1aa99e6b 3755 }
c277df42 3756 /* If it could work, try it. */
eb160463 3757 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3758 {
29d1e993 3759 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3760 REGCP_UNWIND(lastcp);
c277df42
IZ
3761 }
3762 /* Couldn't or didn't -- back up. */
3763 n--;
dfe13c55 3764 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3765 }
a0d0e21e
LW
3766 }
3767 }
4633a7c4 3768 sayNO;
c277df42 3769 break;
a0d0e21e 3770 case END:
0f5d15d6
IZ
3771 if (PL_reg_call_cc) {
3772 re_cc_state *cur_call_cc = PL_reg_call_cc;
3773 CURCUR *cctmp = PL_regcc;
3774 regexp *re = PL_reg_re;
3775 CHECKPOINT cp, lastcp;
3776
3777 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3778 REGCP_SET(lastcp);
0f5d15d6
IZ
3779 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3780 the caller. */
3781 PL_reginput = locinput; /* Make position available to
3782 the callcc. */
3783 cache_re(PL_reg_call_cc->re);
3784 PL_regcc = PL_reg_call_cc->cc;
3785 PL_reg_call_cc = PL_reg_call_cc->prev;
3786 if (regmatch(cur_call_cc->node)) {
3787 PL_reg_call_cc = cur_call_cc;
3788 regcpblow(cp);
3789 sayYES;
3790 }
02db2b7b 3791 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3792 regcppop();
3793 PL_reg_call_cc = cur_call_cc;
3794 PL_regcc = cctmp;
3795 PL_reg_re = re;
3796 cache_re(re);
3797
3798 DEBUG_r(
3799 PerlIO_printf(Perl_debug_log,
3800 "%*s continuation failed...\n",
3801 REPORT_CODE_OFF+PL_regindent*2, "")
3802 );
7821416a 3803 sayNO_SILENT;
0f5d15d6 3804 }
7821416a
IZ
3805 if (locinput < PL_regtill) {
3806 DEBUG_r(PerlIO_printf(Perl_debug_log,
3807 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3808 PL_colors[4],
3809 (long)(locinput - PL_reg_starttry),
3810 (long)(PL_regtill - PL_reg_starttry),
3811 PL_colors[5]));
3812 sayNO_FINAL; /* Cannot match: too short. */
3813 }
3814 PL_reginput = locinput; /* put where regtry can find it */
3815 sayYES_FINAL; /* Success! */
7e5428c5 3816 case SUCCEED:
3280af22 3817 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3818 sayYES_LOUD; /* Success! */
c277df42
IZ
3819 case SUSPEND:
3820 n = 1;
9fe1d20c 3821 PL_reginput = locinput;
9041c2e3 3822 goto do_ifmatch;
a0d0e21e 3823 case UNLESSM:
c277df42 3824 n = 0;
a0ed51b3 3825 if (scan->flags) {
efb30f32
HS
3826 s = HOPBACKc(locinput, scan->flags);
3827 if (!s)
3828 goto say_yes;
3829 PL_reginput = s;
a0ed51b3
LW
3830 }
3831 else
3832 PL_reginput = locinput;
c277df42
IZ
3833 goto do_ifmatch;
3834 case IFMATCH:
3835 n = 1;
a0ed51b3 3836 if (scan->flags) {
efb30f32
HS
3837 s = HOPBACKc(locinput, scan->flags);
3838 if (!s)
3839 goto say_no;
3840 PL_reginput = s;
a0ed51b3
LW
3841 }
3842 else
3843 PL_reginput = locinput;
3844
c277df42 3845 do_ifmatch:
c277df42
IZ
3846 inner = NEXTOPER(NEXTOPER(scan));
3847 if (regmatch(inner) != n) {
3848 say_no:
3849 if (logical) {
3850 logical = 0;
3851 sw = 0;
3852 goto do_longjump;
a0ed51b3
LW
3853 }
3854 else
c277df42
IZ
3855 sayNO;
3856 }
3857 say_yes:
3858 if (logical) {
3859 logical = 0;
3860 sw = 1;
3861 }
fe44a5e8 3862 if (OP(scan) == SUSPEND) {
3280af22 3863 locinput = PL_reginput;
565764a8 3864 nextchr = UCHARAT(locinput);
fe44a5e8 3865 }
c277df42
IZ
3866 /* FALL THROUGH. */
3867 case LONGJMP:
3868 do_longjump:
3869 next = scan + ARG(scan);
3870 if (next == scan)
3871 next = NULL;
a0d0e21e
LW
3872 break;
3873 default:
b900a521 3874 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3875 PTR2UV(scan), OP(scan));
cea2e8a9 3876 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3877 }
02db2b7b 3878 reenter:
a0d0e21e
LW
3879 scan = next;
3880 }
a687059c 3881
a0d0e21e
LW
3882 /*
3883 * We get here only if there's trouble -- normally "case END" is
3884 * the terminating point.
3885 */
cea2e8a9 3886 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3887 /*NOTREACHED*/
4633a7c4
LW
3888 sayNO;
3889
7821416a
IZ
3890yes_loud:
3891 DEBUG_r(
3892 PerlIO_printf(Perl_debug_log,
3893 "%*s %scould match...%s\n",
3894 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3895 );
3896 goto yes;
3897yes_final:
3898 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3899 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3900yes:
3901#ifdef DEBUGGING
3280af22 3902 PL_regindent--;
4633a7c4 3903#endif
02db2b7b
IZ
3904
3905#if 0 /* Breaks $^R */
3906 if (unwind)
3907 regcpblow(firstcp);
3908#endif
4633a7c4
LW
3909 return 1;
3910
3911no:
7821416a
IZ
3912 DEBUG_r(
3913 PerlIO_printf(Perl_debug_log,
3914 "%*s %sfailed...%s\n",
3915 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3916 );
3917 goto do_no;
3918no_final:
3919do_no:
02db2b7b
IZ
3920 if (unwind) {
3921 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3922
3923 switch (uw->type) {
3924 case RE_UNWIND_BRANCH:
3925 case RE_UNWIND_BRANCHJ:
3926 {
3927 re_unwind_branch_t *uwb = &(uw->branch);
3928 I32 lastparen = uwb->lastparen;
9041c2e3 3929
02db2b7b
IZ
3930 REGCP_UNWIND(uwb->lastcp);
3931 for (n = *PL_reglastparen; n > lastparen; n--)
3932 PL_regendp[n] = -1;
3933 *PL_reglastparen = n;
3934 scan = next = uwb->next;
9041c2e3
NIS
3935 if ( !scan ||
3936 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3937 ? BRANCH : BRANCHJ) ) { /* Failure */
3938 unwind = uwb->prev;
3939#ifdef DEBUGGING
3940 PL_regindent--;
3941#endif
3942 goto do_no;
3943 }
3944 /* Have more choice yet. Reuse the same uwb. */
3945 /*SUPPRESS 560*/
3946 if ((n = (uwb->type == RE_UNWIND_BRANCH
3947 ? NEXT_OFF(next) : ARG(next))))
3948 next += n;
3949 else
3950 next = NULL; /* XXXX Needn't unwinding in this case... */
3951 uwb->next = next;
3952 next = NEXTOPER(scan);
3953 if (uwb->type == RE_UNWIND_BRANCHJ)
3954 next = NEXTOPER(next);
3955 locinput = uwb->locinput;
3956 nextchr = uwb->nextchr;
3957#ifdef DEBUGGING
3958 PL_regindent = uwb->regindent;
3959#endif
3960
3961 goto reenter;
3962 }
3963 /* NOT REACHED */
3964 default:
3965 Perl_croak(aTHX_ "regexp unwind memory corruption");
3966 }
3967 /* NOT REACHED */
3968 }
4633a7c4 3969#ifdef DEBUGGING
3280af22 3970 PL_regindent--;
4633a7c4 3971#endif
a0d0e21e 3972 return 0;
a687059c
LW
3973}
3974
3975/*
3976 - regrepeat - repeatedly match something simple, report how many
3977 */
3978/*
3979 * [This routine now assumes that it will only match on things of length 1.
3980 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 3981 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 3982 */
76e3520e 3983STATIC I32
cea2e8a9 3984S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 3985{
a0d0e21e 3986 register char *scan;
a0d0e21e 3987 register I32 c;
3280af22 3988 register char *loceol = PL_regeol;
a0ed51b3 3989 register I32 hardcount = 0;
53c4c00c 3990 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 3991
3280af22 3992 scan = PL_reginput;
faf11cac
HS
3993 if (max == REG_INFTY)
3994 max = I32_MAX;
3995 else if (max < loceol - scan)
a0d0e21e 3996 loceol = scan + max;
a0d0e21e 3997 switch (OP(p)) {
22c35a8c 3998 case REG_ANY:
1aa99e6b 3999 if (do_utf8) {
ffc61ed2 4000 loceol = PL_regeol;
1aa99e6b 4001 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4002 scan += UTF8SKIP(scan);
4003 hardcount++;
4004 }
4005 } else {
4006 while (scan < loceol && *scan != '\n')
4007 scan++;
a0ed51b3
LW
4008 }
4009 break;
ffc61ed2 4010 case SANY:
def8e4ea
JH
4011 if (do_utf8) {
4012 loceol = PL_regeol;
a0804c9e 4013 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4014 scan += UTF8SKIP(scan);
4015 hardcount++;
4016 }
4017 }
4018 else
4019 scan = loceol;
a0ed51b3 4020 break;
f33976b4
DB
4021 case CANY:
4022 scan = loceol;
4023 break;
bbce6d69 4024 case EXACT: /* length of string is 1 */
cd439c50 4025 c = (U8)*STRING(p);
bbce6d69 4026 while (scan < loceol && UCHARAT(scan) == c)
4027 scan++;
4028 break;
4029 case EXACTF: /* length of string is 1 */
cd439c50 4030 c = (U8)*STRING(p);
bbce6d69 4031 while (scan < loceol &&
22c35a8c 4032 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4033 scan++;
4034 break;
4035 case EXACTFL: /* length of string is 1 */
3280af22 4036 PL_reg_flags |= RF_tainted;
cd439c50 4037 c = (U8)*STRING(p);
bbce6d69 4038 while (scan < loceol &&
22c35a8c 4039 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4040 scan++;
4041 break;
4042 case ANYOF:
ffc61ed2
JH
4043 if (do_utf8) {
4044 loceol = PL_regeol;
1aa99e6b 4045 while (hardcount < max && scan < loceol &&
ba7b4546 4046 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4047 scan += UTF8SKIP(scan);
4048 hardcount++;
4049 }
4050 } else {
7d3e948e 4051 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4052 scan++;
4053 }
a0d0e21e
LW
4054 break;
4055 case ALNUM:
1aa99e6b 4056 if (do_utf8) {
ffc61ed2 4057 loceol = PL_regeol;
8269fa76 4058 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4059 while (hardcount < max && scan < loceol &&
3568d838 4060 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4061 scan += UTF8SKIP(scan);
4062 hardcount++;
4063 }
4064 } else {
4065 while (scan < loceol && isALNUM(*scan))
4066 scan++;
a0ed51b3
LW
4067 }
4068 break;
bbce6d69 4069 case ALNUML:
3280af22 4070 PL_reg_flags |= RF_tainted;
1aa99e6b 4071 if (do_utf8) {
ffc61ed2 4072 loceol = PL_regeol;
1aa99e6b
IH
4073 while (hardcount < max && scan < loceol &&
4074 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4075 scan += UTF8SKIP(scan);
4076 hardcount++;
4077 }
4078 } else {
4079 while (scan < loceol && isALNUM_LC(*scan))
4080 scan++;
a0ed51b3
LW
4081 }
4082 break;
a0d0e21e 4083 case NALNUM:
1aa99e6b 4084 if (do_utf8) {
ffc61ed2 4085 loceol = PL_regeol;
8269fa76 4086 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4087 while (hardcount < max && scan < loceol &&
3568d838 4088 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4089 scan += UTF8SKIP(scan);
4090 hardcount++;
4091 }
4092 } else {
4093 while (scan < loceol && !isALNUM(*scan))
4094 scan++;
a0ed51b3
LW
4095 }
4096 break;
bbce6d69 4097 case NALNUML:
3280af22 4098 PL_reg_flags |= RF_tainted;
1aa99e6b 4099 if (do_utf8) {
ffc61ed2 4100 loceol = PL_regeol;
1aa99e6b
IH
4101 while (hardcount < max && scan < loceol &&
4102 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4103 scan += UTF8SKIP(scan);
4104 hardcount++;
4105 }
4106 } else {
4107 while (scan < loceol && !isALNUM_LC(*scan))
4108 scan++;
a0ed51b3
LW
4109 }
4110 break;
a0d0e21e 4111 case SPACE:
1aa99e6b 4112 if (do_utf8) {
ffc61ed2 4113 loceol = PL_regeol;
8269fa76 4114 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4115 while (hardcount < max && scan < loceol &&
3568d838
JH
4116 (*scan == ' ' ||
4117 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4118 scan += UTF8SKIP(scan);
4119 hardcount++;
4120 }
4121 } else {
4122 while (scan < loceol && isSPACE(*scan))
4123 scan++;
a0ed51b3
LW
4124 }
4125 break;
bbce6d69 4126 case SPACEL:
3280af22 4127 PL_reg_flags |= RF_tainted;
1aa99e6b 4128 if (do_utf8) {
ffc61ed2 4129 loceol = PL_regeol;
1aa99e6b 4130 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4131 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4132 scan += UTF8SKIP(scan);
4133 hardcount++;
4134 }
4135 } else {
4136 while (scan < loceol && isSPACE_LC(*scan))
4137 scan++;
a0ed51b3
LW
4138 }
4139 break;
a0d0e21e 4140 case NSPACE:
1aa99e6b 4141 if (do_utf8) {
ffc61ed2 4142 loceol = PL_regeol;
8269fa76 4143 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4144 while (hardcount < max && scan < loceol &&
3568d838
JH
4145 !(*scan == ' ' ||
4146 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4147 scan += UTF8SKIP(scan);
4148 hardcount++;
4149 }
4150 } else {
4151 while (scan < loceol && !isSPACE(*scan))
4152 scan++;
4153 break;
a0ed51b3 4154 }
bbce6d69 4155 case NSPACEL:
3280af22 4156 PL_reg_flags |= RF_tainted;
1aa99e6b 4157 if (do_utf8) {
ffc61ed2 4158 loceol = PL_regeol;
1aa99e6b 4159 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4160 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4161 scan += UTF8SKIP(scan);
4162 hardcount++;
4163 }
4164 } else {
4165 while (scan < loceol && !isSPACE_LC(*scan))
4166 scan++;
a0ed51b3
LW
4167 }
4168 break;
a0d0e21e 4169 case DIGIT:
1aa99e6b 4170 if (do_utf8) {
ffc61ed2 4171 loceol = PL_regeol;
8269fa76 4172 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4173 while (hardcount < max && scan < loceol &&
3568d838 4174 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4175 scan += UTF8SKIP(scan);
4176 hardcount++;
4177 }
4178 } else {
4179 while (scan < loceol && isDIGIT(*scan))
4180 scan++;
a0ed51b3
LW
4181 }
4182 break;
a0d0e21e 4183 case NDIGIT:
1aa99e6b 4184 if (do_utf8) {
ffc61ed2 4185 loceol = PL_regeol;
8269fa76 4186 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4187 while (hardcount < max && scan < loceol &&
3568d838 4188 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4189 scan += UTF8SKIP(scan);
4190 hardcount++;
4191 }
4192 } else {
4193 while (scan < loceol && !isDIGIT(*scan))
4194 scan++;
a0ed51b3
LW
4195 }
4196 break;
a0d0e21e
LW
4197 default: /* Called on something of 0 width. */
4198 break; /* So match right here or not at all. */
4199 }
a687059c 4200
a0ed51b3
LW
4201 if (hardcount)
4202 c = hardcount;
4203 else
4204 c = scan - PL_reginput;
3280af22 4205 PL_reginput = scan;
a687059c 4206
9041c2e3 4207 DEBUG_r(
c277df42
IZ
4208 {
4209 SV *prop = sv_newmortal();
4210
4211 regprop(prop, p);
9041c2e3
NIS
4212 PerlIO_printf(Perl_debug_log,
4213 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4214 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4215 });
9041c2e3 4216
a0d0e21e 4217 return(c);
a687059c
LW
4218}
4219
4220/*
c277df42 4221 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4222 *
c277df42
IZ
4223 * The repeater is supposed to have constant length.
4224 */
4225
76e3520e 4226STATIC I32
cea2e8a9 4227S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4228{
b7953727 4229 register char *scan = Nullch;
c277df42 4230 register char *start;
3280af22 4231 register char *loceol = PL_regeol;
a0ed51b3 4232 I32 l = 0;
708e3b05 4233 I32 count = 0, res = 1;
a0ed51b3
LW
4234
4235 if (!max)
4236 return 0;
c277df42 4237
3280af22 4238 start = PL_reginput;
53c4c00c 4239 if (PL_reg_match_utf8) {
708e3b05 4240 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4241 if (!count++) {
4242 l = 0;
4243 while (start < PL_reginput) {
4244 l++;
4245 start += UTF8SKIP(start);
4246 }
4247 *lp = l;
4248 if (l == 0)
4249 return max;
4250 }
4251 if (count == max)
4252 return count;
4253 }
4254 }
4255 else {
708e3b05 4256 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4257 if (!count++) {
4258 *lp = l = PL_reginput - start;
4259 if (max != REG_INFTY && l*max < loceol - scan)
4260 loceol = scan + l*max;
4261 if (l == 0)
4262 return max;
c277df42
IZ
4263 }
4264 }
4265 }
708e3b05 4266 if (!res)
3280af22 4267 PL_reginput = scan;
9041c2e3 4268
a0ed51b3 4269 return count;
c277df42
IZ
4270}
4271
4272/*
ffc61ed2
JH
4273- regclass_swash - prepare the utf8 swash
4274*/
4275
4276SV *
9e55ce06 4277Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4278{
9e55ce06
JH
4279 SV *sw = NULL;
4280 SV *si = NULL;
4281 SV *alt = NULL;
ffc61ed2
JH
4282
4283 if (PL_regdata && PL_regdata->count) {
4284 U32 n = ARG(node);
4285
4286 if (PL_regdata->what[n] == 's') {
4287 SV *rv = (SV*)PL_regdata->data[n];
4288 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4289 SV **ary = AvARRAY(av);
9e55ce06 4290 SV **a, **b;
9041c2e3 4291
9e55ce06
JH
4292 /* See the end of regcomp.c:S_reglass() for
4293 * documentation of these array elements. */
4294
b11f357e
JH
4295 si = *ary;
4296 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4297 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4298
ffc61ed2
JH
4299 if (a)
4300 sw = *a;
4301 else if (si && doinit) {
4302 sw = swash_init("utf8", "", si, 1, 0);
4303 (void)av_store(av, 1, sw);
4304 }
9e55ce06
JH
4305 if (b)
4306 alt = *b;
ffc61ed2
JH
4307 }
4308 }
4309
9e55ce06
JH
4310 if (listsvp)
4311 *listsvp = si;
4312 if (altsvp)
4313 *altsvp = alt;
ffc61ed2
JH
4314
4315 return sw;
4316}
4317
4318/*
ba7b4546 4319 - reginclass - determine if a character falls into a character class
832705d4
JH
4320
4321 The n is the ANYOF regnode, the p is the target string, lenp
4322 is pointer to the maximum length of how far to go in the p
4323 (if the lenp is zero, UTF8SKIP(p) is used),
4324 do_utf8 tells whether the target string is in UTF-8.
4325
bbce6d69 4326 */
4327
76e3520e 4328STATIC bool
ba7b4546 4329S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4330{
ffc61ed2 4331 char flags = ANYOF_FLAGS(n);
bbce6d69 4332 bool match = FALSE;
cc07378b 4333 UV c = *p;
ae9ddab8 4334 STRLEN len = 0;
9e55ce06 4335 STRLEN plen;
1aa99e6b 4336
ae9ddab8
JH
4337 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4338 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4339 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4340
0f0076b4 4341 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4342 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4343 if (lenp)
4344 *lenp = 0;
ffc61ed2 4345 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4346 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4347 match = TRUE;
bbce6d69 4348 }
3568d838 4349 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4350 match = TRUE;
ffc61ed2 4351 if (!match) {
9e55ce06
JH
4352 AV *av;
4353 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4354
4355 if (sw) {
3568d838 4356 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4357 match = TRUE;
4358 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4359 if (!match && lenp && av) {
4360 I32 i;
4361
4362 for (i = 0; i <= av_len(av); i++) {
4363 SV* sv = *av_fetch(av, i, FALSE);
4364 STRLEN len;
4365 char *s = SvPV(sv, len);
4366
061b10df 4367 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4368 *lenp = len;
4369 match = TRUE;
4370 break;
4371 }
4372 }
4373 }
4374 if (!match) {
4a623e43
JH
4375 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4376 STRLEN tmplen;
4377
9e55ce06
JH
4378 to_utf8_fold(p, tmpbuf, &tmplen);
4379 if (swash_fetch(sw, tmpbuf, do_utf8))
4380 match = TRUE;
4381 }
ffc61ed2
JH
4382 }
4383 }
bbce6d69 4384 }
9e55ce06 4385 if (match && lenp && *lenp == 0)
0f0076b4 4386 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4387 }
1aa99e6b 4388 if (!match && c < 256) {
ffc61ed2
JH
4389 if (ANYOF_BITMAP_TEST(n, c))
4390 match = TRUE;
4391 else if (flags & ANYOF_FOLD) {
eb160463 4392 U8 f;
a0ed51b3 4393
ffc61ed2
JH
4394 if (flags & ANYOF_LOCALE) {
4395 PL_reg_flags |= RF_tainted;
4396 f = PL_fold_locale[c];
4397 }
4398 else
4399 f = PL_fold[c];
4400 if (f != c && ANYOF_BITMAP_TEST(n, f))
4401 match = TRUE;
4402 }
4403
4404 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4405 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4406 if (
4407 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4408 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4409 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4410 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4411 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4412 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4413 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4414 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4415 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4416 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4417 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4418 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4419 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4420 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4421 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4422 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4423 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4424 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4425 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4426 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4427 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4428 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4429 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4430 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4431 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4432 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4433 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4434 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4435 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4436 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4437 ) /* How's that for a conditional? */
4438 {
4439 match = TRUE;
4440 }
a0ed51b3 4441 }
a0ed51b3
LW
4442 }
4443
a0ed51b3
LW
4444 return (flags & ANYOF_INVERT) ? !match : match;
4445}
161b471a 4446
dfe13c55 4447STATIC U8 *
cea2e8a9 4448S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4449{
1aa99e6b
IH
4450 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4451}
4452
4453STATIC U8 *
4454S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4455{
a0ed51b3 4456 if (off >= 0) {
1aa99e6b 4457 while (off-- && s < lim) {
ffc61ed2 4458 /* XXX could check well-formedness here */
a0ed51b3 4459 s += UTF8SKIP(s);
ffc61ed2 4460 }
a0ed51b3
LW
4461 }
4462 else {
4463 while (off++) {
1aa99e6b 4464 if (s > lim) {
a0ed51b3 4465 s--;
ffc61ed2 4466 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4467 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4468 s--;
ffc61ed2
JH
4469 }
4470 /* XXX could check well-formedness here */
a0ed51b3
LW
4471 }
4472 }
4473 }
4474 return s;
4475}
161b471a 4476
dfe13c55 4477STATIC U8 *
1aa99e6b 4478S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4479{
1aa99e6b
IH
4480 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4481}
4482
4483STATIC U8 *
4484S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4485{
4486 if (off >= 0) {
1aa99e6b 4487 while (off-- && s < lim) {
ffc61ed2 4488 /* XXX could check well-formedness here */
a0ed51b3 4489 s += UTF8SKIP(s);
ffc61ed2 4490 }
a0ed51b3
LW
4491 if (off >= 0)
4492 return 0;
4493 }
4494 else {
4495 while (off++) {
1aa99e6b 4496 if (s > lim) {
a0ed51b3 4497 s--;
ffc61ed2 4498 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4499 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4500 s--;
ffc61ed2
JH
4501 }
4502 /* XXX could check well-formedness here */
a0ed51b3
LW
4503 }
4504 else
4505 break;
4506 }
4507 if (off <= 0)
4508 return 0;
4509 }
4510 return s;
4511}
51371543 4512
51371543 4513static void
acfe0abc 4514restore_pos(pTHX_ void *arg)
51371543 4515{
51371543
GS
4516 if (PL_reg_eval_set) {
4517 if (PL_reg_oldsaved) {
4518 PL_reg_re->subbeg = PL_reg_oldsaved;
4519 PL_reg_re->sublen = PL_reg_oldsavedlen;
4520 RX_MATCH_COPIED_on(PL_reg_re);
4521 }
4522 PL_reg_magic->mg_len = PL_reg_oldpos;
4523 PL_reg_eval_set = 0;
4524 PL_curpm = PL_reg_oldcurpm;
4525 }
4526}
33b8afdf
JH
4527
4528STATIC void
4529S_to_utf8_substr(pTHX_ register regexp *prog)
4530{
4531 SV* sv;
4532 if (prog->float_substr && !prog->float_utf8) {
4533 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4534 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4535 sv_utf8_upgrade(sv);
4536 if (SvTAIL(prog->float_substr))
4537 SvTAIL_on(sv);
4538 if (prog->float_substr == prog->check_substr)
4539 prog->check_utf8 = sv;
4540 }
4541 if (prog->anchored_substr && !prog->anchored_utf8) {
4542 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4543 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4544 sv_utf8_upgrade(sv);
4545 if (SvTAIL(prog->anchored_substr))
4546 SvTAIL_on(sv);
4547 if (prog->anchored_substr == prog->check_substr)
4548 prog->check_utf8 = sv;
4549 }
4550}
4551
4552STATIC void
4553S_to_byte_substr(pTHX_ register regexp *prog)
4554{
4555 SV* sv;
4556 if (prog->float_utf8 && !prog->float_substr) {
4557 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4558 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4559 if (sv_utf8_downgrade(sv, TRUE)) {
4560 if (SvTAIL(prog->float_utf8))
4561 SvTAIL_on(sv);
4562 } else {
4563 SvREFCNT_dec(sv);
4564 prog->float_substr = sv = &PL_sv_undef;
4565 }
4566 if (prog->float_utf8 == prog->check_utf8)
4567 prog->check_substr = sv;
4568 }
4569 if (prog->anchored_utf8 && !prog->anchored_substr) {
4570 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4571 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4572 if (sv_utf8_downgrade(sv, TRUE)) {
4573 if (SvTAIL(prog->anchored_utf8))
4574 SvTAIL_on(sv);
4575 } else {
4576 SvREFCNT_dec(sv);
4577 prog->anchored_substr = sv = &PL_sv_undef;
4578 }
4579 if (prog->anchored_utf8 == prog->check_utf8)
4580 prog->check_substr = sv;
4581 }
4582}