This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Whitespace and indentation fix in the output of B::Debug.
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
be3c0a43 70 **** Copyright (c) 1991-2002, Larry Wall
a687059c 71 ****
9ef589d8
LW
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
a687059c
LW
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
864dbfa3 80#define PERL_IN_REGEXEC_C
a687059c 81#include "perl.h"
0f5d15d6 82
a687059c
LW
83#include "regcomp.h"
84
c277df42
IZ
85#define RF_tainted 1 /* tainted information used? */
86#define RF_warned 2 /* warned about big count? */
ce862d02 87#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
88#define RF_utf8 8 /* String contains multibyte chars? */
89
eb160463 90#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
91
92#define RS_init 1 /* eval environment created */
93#define RS_set 2 /* replsv value is set */
c277df42 94
a687059c
LW
95#ifndef STATIC
96#define STATIC static
97#endif
98
ba7b4546 99#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 100
c277df42
IZ
101/*
102 * Forwards.
103 */
104
33b8afdf 105#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 106#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 107
dfe13c55
GS
108#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
109#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
110#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
111#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
112#define HOPc(pos,off) ((char*)HOP(pos,off))
113#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 114
efb30f32 115#define HOPBACK(pos, off) ( \
e54858b0 116 (PL_reg_match_utf8) \
efb30f32
HS
117 ? reghopmaybe((U8*)pos, -off) \
118 : (pos - off >= PL_bostr) \
119 ? (U8*)(pos - off) \
120 : (U8*)NULL \
121)
122#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
123
1aa99e6b
IH
124#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
125#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
127#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
129#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
130
977e4b84 131#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 132
5f80c4cf 133/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
134#define JUMPABLE(rn) ( \
135 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
136 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
137 OP(rn) == PLUS || OP(rn) == MINMOD || \
138 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
139)
140
cca55fe3
JP
141#define HAS_TEXT(rn) ( \
142 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
143)
e2d8ce26 144
a84d97b6
HS
145/*
146 Search for mandatory following text node; for lookahead, the text must
147 follow but for lookbehind (rn->flags != 0) we skip to the next step.
148*/
cca55fe3 149#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 150 while (JUMPABLE(rn)) \
a84d97b6 151 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 152 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
153 else if (OP(rn) == PLUS) \
154 rn = NEXTOPER(rn); \
a84d97b6
HS
155 else if (OP(rn) == IFMATCH) \
156 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 157 else rn += NEXT_OFF(rn); \
5f80c4cf 158} STMT_END
74750237 159
acfe0abc 160static void restore_pos(pTHX_ void *arg);
51371543 161
76e3520e 162STATIC CHECKPOINT
cea2e8a9 163S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 164{
3280af22 165 int retval = PL_savestack_ix;
b1ce53c5
JH
166#define REGCP_PAREN_ELEMS 4
167 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
168 int p;
169
e49a9654
IH
170 if (paren_elems_to_push < 0)
171 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
172
a01268b5 173#define REGCP_OTHER_ELEMS 6
b1ce53c5 174 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 175 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 176/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
177 SSPUSHINT(PL_regendp[p]);
178 SSPUSHINT(PL_regstartp[p]);
3280af22 179 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
180 SSPUSHINT(p);
181 }
b1ce53c5 182/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
183 SSPUSHINT(PL_regsize);
184 SSPUSHINT(*PL_reglastparen);
a01268b5 185 SSPUSHINT(*PL_reglastcloseparen);
3280af22 186 SSPUSHPTR(PL_reginput);
41123dfd
JH
187#define REGCP_FRAME_ELEMS 2
188/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
189 * are needed for the regexp context stack bookkeeping. */
190 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 191 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 192
a0d0e21e
LW
193 return retval;
194}
195
c277df42 196/* These are needed since we do not localize EVAL nodes: */
02db2b7b 197# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 199 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 200
02db2b7b 201# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 202 PerlIO_printf(Perl_debug_log, \
faccc32b 203 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 204 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 205
76e3520e 206STATIC char *
cea2e8a9 207S_regcppop(pTHX)
a0d0e21e 208{
b1ce53c5 209 I32 i;
a0d0e21e
LW
210 U32 paren = 0;
211 char *input;
cf93c79d 212 I32 tmps;
b1ce53c5
JH
213
214 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 215 i = SSPOPINT;
b1ce53c5
JH
216 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
217 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 218 input = (char *) SSPOPPTR;
a01268b5 219 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
220 *PL_reglastparen = SSPOPINT;
221 PL_regsize = SSPOPINT;
b1ce53c5
JH
222
223 /* Now restore the parentheses context. */
41123dfd
JH
224 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
225 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 226 paren = (U32)SSPOPINT;
3280af22 227 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
228 PL_regstartp[paren] = SSPOPINT;
229 tmps = SSPOPINT;
3280af22
NIS
230 if (paren <= *PL_reglastparen)
231 PL_regendp[paren] = tmps;
c277df42 232 DEBUG_r(
c3464db5 233 PerlIO_printf(Perl_debug_log,
b900a521 234 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 235 (UV)paren, (IV)PL_regstartp[paren],
b900a521 236 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 237 (IV)PL_regendp[paren],
3280af22 238 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 239 );
a0d0e21e 240 }
c277df42 241 DEBUG_r(
eb160463 242 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 243 PerlIO_printf(Perl_debug_log,
faccc32b
JH
244 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
245 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
246 }
247 );
daf18116 248#if 1
dafc8851
JH
249 /* It would seem that the similar code in regtry()
250 * already takes care of this, and in fact it is in
251 * a better location to since this code can #if 0-ed out
252 * but the code in regtry() is needed or otherwise tests
253 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
254 * (as of patchlevel 7877) will fail. Then again,
255 * this code seems to be necessary or otherwise
256 * building DynaLoader will fail:
257 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
258 * --jhi */
eb160463
GS
259 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
260 if ((I32)paren > PL_regsize)
cf93c79d
IZ
261 PL_regstartp[paren] = -1;
262 PL_regendp[paren] = -1;
a0d0e21e 263 }
dafc8851 264#endif
a0d0e21e
LW
265 return input;
266}
267
0f5d15d6 268STATIC char *
cea2e8a9 269S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
270{
271 I32 tmp = PL_savestack_ix;
272
273 PL_savestack_ix = ss;
274 regcppop();
275 PL_savestack_ix = tmp;
942e002e 276 return Nullch;
0f5d15d6
IZ
277}
278
279typedef struct re_cc_state
280{
281 I32 ss;
282 regnode *node;
283 struct re_cc_state *prev;
284 CURCUR *cc;
285 regexp *re;
286} re_cc_state;
287
02db2b7b 288#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 289
29d1e993
HS
290#define TRYPAREN(paren, n, input) { \
291 if (paren) { \
292 if (n) { \
293 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
294 PL_regendp[paren] = input - PL_bostr; \
295 } \
296 else \
297 PL_regendp[paren] = -1; \
298 } \
299 if (regmatch(next)) \
300 sayYES; \
301 if (paren && n) \
302 PL_regendp[paren] = -1; \
303}
304
305
a687059c 306/*
e50aee73 307 * pregexec and friends
a687059c
LW
308 */
309
310/*
c277df42 311 - pregexec - match a regexp against a string
a687059c 312 */
c277df42 313I32
864dbfa3 314Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 315 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
316/* strend: pointer to null at end of string */
317/* strbeg: real beginning of string */
318/* minend: end of match must be >=minend after stringarg. */
319/* nosave: For optimizations. */
320{
321 return
9041c2e3 322 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
323 nosave ? 0 : REXEC_COPY_STR);
324}
0f5d15d6
IZ
325
326STATIC void
cea2e8a9 327S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
328{
329 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
330#ifdef DEBUGGING
331 PL_regprogram = prog->program;
332#endif
333 PL_regnpar = prog->nparens;
9041c2e3
NIS
334 PL_regdata = prog->data;
335 PL_reg_re = prog;
0f5d15d6 336}
22e551b9 337
9041c2e3 338/*
cad2e5aa
JH
339 * Need to implement the following flags for reg_anch:
340 *
341 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
342 * USE_INTUIT_ML
343 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
344 * INTUIT_AUTORITATIVE_ML
345 * INTUIT_ONCE_NOML - Intuit can match in one location only.
346 * INTUIT_ONCE_ML
347 *
348 * Another flag for this function: SECOND_TIME (so that float substrs
349 * with giant delta may be not rechecked).
350 */
351
352/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
353
2c2d71f5 354/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
355 Otherwise, only SvCUR(sv) is used to get strbeg. */
356
357/* XXXX We assume that strpos is strbeg unless sv. */
358
6eb5f6b9
JH
359/* XXXX Some places assume that there is a fixed substring.
360 An update may be needed if optimizer marks as "INTUITable"
361 RExen without fixed substrings. Similarly, it is assumed that
362 lengths of all the strings are no more than minlen, thus they
363 cannot come from lookahead.
364 (Or minlen should take into account lookahead.) */
365
2c2d71f5
JH
366/* A failure to find a constant substring means that there is no need to make
367 an expensive call to REx engine, thus we celebrate a failure. Similarly,
368 finding a substring too deep into the string means that less calls to
30944b6d
IZ
369 regtry() should be needed.
370
371 REx compiler's optimizer found 4 possible hints:
372 a) Anchored substring;
373 b) Fixed substring;
374 c) Whether we are anchored (beginning-of-line or \G);
375 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 376 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
377 string which does not contradict any of them.
378 */
2c2d71f5 379
6eb5f6b9
JH
380/* Most of decisions we do here should have been done at compile time.
381 The nodes of the REx which we used for the search should have been
382 deleted from the finite automaton. */
383
cad2e5aa
JH
384char *
385Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
386 char *strend, U32 flags, re_scream_pos_data *data)
387{
b7953727 388 register I32 start_shift = 0;
cad2e5aa 389 /* Should be nonnegative! */
b7953727 390 register I32 end_shift = 0;
2c2d71f5
JH
391 register char *s;
392 register SV *check;
a1933d95 393 char *strbeg;
cad2e5aa 394 char *t;
33b8afdf 395 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 396 I32 ml_anch;
6eb5f6b9 397 register char *other_last = Nullch; /* other substr checked before this */
b7953727 398 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
399#ifdef DEBUGGING
400 char *i_strpos = strpos;
ce333219 401 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 402#endif
cad2e5aa 403
b8d68ded
JH
404 if (prog->reganch & ROPT_UTF8) {
405 DEBUG_r(PerlIO_printf(Perl_debug_log,
406 "UTF-8 regex...\n"));
407 PL_reg_flags |= RF_utf8;
408 }
409
2a782b5b 410 DEBUG_r({
b8d68ded 411 char *s = PL_reg_match_utf8 ?
c728cb41
JH
412 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
413 strpos;
b8d68ded
JH
414 int len = PL_reg_match_utf8 ?
415 strlen(s) : strend - strpos;
2a782b5b
JH
416 if (!PL_colorset)
417 reginitcolors();
b8d68ded
JH
418 if (PL_reg_match_utf8)
419 DEBUG_r(PerlIO_printf(Perl_debug_log,
420 "UTF-8 target...\n"));
2a782b5b
JH
421 PerlIO_printf(Perl_debug_log,
422 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
423 PL_colors[4],PL_colors[5],PL_colors[0],
424 prog->precomp,
425 PL_colors[1],
426 (strlen(prog->precomp) > 60 ? "..." : ""),
427 PL_colors[0],
428 (int)(len > 60 ? 60 : len),
429 s, PL_colors[1],
430 (len > 60 ? "..." : "")
431 );
432 });
cad2e5aa 433
c344f387
JH
434 /* CHR_DIST() would be more correct here but it makes things slow. */
435 if (prog->minlen > strend - strpos) {
a72c7584
JH
436 DEBUG_r(PerlIO_printf(Perl_debug_log,
437 "String too short... [re_intuit_start]\n"));
cad2e5aa 438 goto fail;
2c2d71f5 439 }
a1933d95 440 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 441 PL_regeol = strend;
33b8afdf
JH
442 if (do_utf8) {
443 if (!prog->check_utf8 && prog->check_substr)
444 to_utf8_substr(prog);
445 check = prog->check_utf8;
446 } else {
447 if (!prog->check_substr && prog->check_utf8)
448 to_byte_substr(prog);
449 check = prog->check_substr;
450 }
451 if (check == &PL_sv_undef) {
452 DEBUG_r(PerlIO_printf(Perl_debug_log,
453 "Non-utf string cannot match utf check string\n"));
454 goto fail;
455 }
2c2d71f5 456 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
457 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
458 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 459 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 460
7e25d62c
JH
461 if (!ml_anch) {
462 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
463 | ROPT_IMPLICIT)) /* not a real BOL */
464 /* SvCUR is not set on references: SvRV and SvPVX overlap */
465 && sv && !SvROK(sv)
466 && (strpos != strbeg)) {
467 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
468 goto fail;
469 }
470 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 471 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 472 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
473 I32 slen;
474
1aa99e6b 475 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
476 if (SvTAIL(check)) {
477 slen = SvCUR(check); /* >= 1 */
cad2e5aa 478
9041c2e3 479 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
480 || (strend - s == slen && strend[-1] != '\n')) {
481 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
482 goto fail_finish;
cad2e5aa
JH
483 }
484 /* Now should match s[0..slen-2] */
485 slen--;
653099ff 486 if (slen && (*SvPVX(check) != *s
cad2e5aa 487 || (slen > 1
653099ff 488 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
489 report_neq:
490 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
491 goto fail_finish;
492 }
cad2e5aa 493 }
653099ff
GS
494 else if (*SvPVX(check) != *s
495 || ((slen = SvCUR(check)) > 1
496 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
497 goto report_neq;
498 goto success_at_start;
7e25d62c 499 }
cad2e5aa 500 }
2c2d71f5 501 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 502 s = strpos;
2c2d71f5 503 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 504 end_shift = prog->minlen - start_shift -
653099ff 505 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 506 if (!ml_anch) {
653099ff
GS
507 I32 end = prog->check_offset_max + CHR_SVLEN(check)
508 - (SvTAIL(check) != 0);
1aa99e6b 509 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
510
511 if (end_shift < eshift)
512 end_shift = eshift;
513 }
cad2e5aa 514 }
2c2d71f5 515 else { /* Can match at random position */
cad2e5aa
JH
516 ml_anch = 0;
517 s = strpos;
2c2d71f5
JH
518 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
519 /* Should be nonnegative! */
520 end_shift = prog->minlen - start_shift -
653099ff 521 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
522 }
523
2c2d71f5 524#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 525 if (end_shift < 0)
6bbae5e6 526 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
527#endif
528
2c2d71f5
JH
529 restart:
530 /* Find a possible match in the region s..strend by looking for
531 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 532 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
533 I32 p = -1; /* Internal iterator of scream. */
534 I32 *pp = data ? data->scream_pos : &p;
535
2c2d71f5
JH
536 if (PL_screamfirst[BmRARE(check)] >= 0
537 || ( BmRARE(check) == '\n'
538 && (BmPREVIOUS(check) == SvCUR(check) - 1)
539 && SvTAIL(check) ))
9041c2e3 540 s = screaminstr(sv, check,
2c2d71f5 541 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 542 else
2c2d71f5 543 goto fail_finish;
cad2e5aa
JH
544 if (data)
545 *data->scream_olds = s;
546 }
f33976b4 547 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
548 s = fbm_instr((U8*)(s + start_shift),
549 (U8*)(strend - end_shift),
550 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 551 else
1aa99e6b
IH
552 s = fbm_instr(HOP3(s, start_shift, strend),
553 HOP3(strend, -end_shift, strbeg),
2c2d71f5 554 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
555
556 /* Update the count-of-usability, remove useless subpatterns,
557 unshift s. */
2c2d71f5
JH
558
559 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
560 (s ? "Found" : "Did not find"),
33b8afdf 561 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 562 PL_colors[0],
7b0972df
JH
563 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
564 SvPVX(check),
2c2d71f5
JH
565 PL_colors[1], (SvTAIL(check) ? "$" : ""),
566 (s ? " at offset " : "...\n") ) );
567
568 if (!s)
569 goto fail_finish;
570
6eb5f6b9
JH
571 check_at = s;
572
2c2d71f5 573 /* Finish the diagnostic message */
30944b6d 574 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
575
576 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
577 Start with the other substr.
578 XXXX no SCREAM optimization yet - and a very coarse implementation
579 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
580 *always* match. Probably should be marked during compile...
581 Probably it is right to do no SCREAM here...
582 */
583
33b8afdf 584 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 585 /* Take into account the "other" substring. */
2c2d71f5
JH
586 /* XXXX May be hopelessly wrong for UTF... */
587 if (!other_last)
6eb5f6b9 588 other_last = strpos;
33b8afdf 589 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
590 do_other_anchored:
591 {
1aa99e6b 592 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 593 char *s1 = s;
33b8afdf 594 SV* must;
2c2d71f5 595
2c2d71f5
JH
596 t = s - prog->check_offset_max;
597 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 598 && (!do_utf8
1aa99e6b 599 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 600 && t > strpos)))
30944b6d 601 /* EMPTY */;
2c2d71f5
JH
602 else
603 t = strpos;
1aa99e6b 604 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
605 if (t < other_last) /* These positions already checked */
606 t = other_last;
1aa99e6b 607 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
608 if (last < last1)
609 last1 = last;
610 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
611 /* On end-of-str: see comment below. */
33b8afdf
JH
612 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
613 if (must == &PL_sv_undef) {
614 s = (char*)NULL;
615 DEBUG_r(must = prog->anchored_utf8); /* for debug */
616 }
617 else
618 s = fbm_instr(
619 (unsigned char*)t,
620 HOP3(HOP3(last1, prog->anchored_offset, strend)
621 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
622 must,
623 PL_multiline ? FBMrf_MULTILINE : 0
624 );
1aa99e6b
IH
625 DEBUG_r(PerlIO_printf(Perl_debug_log,
626 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
627 (s ? "Found" : "Contradicts"),
628 PL_colors[0],
33b8afdf
JH
629 (int)(SvCUR(must)
630 - (SvTAIL(must)!=0)),
631 SvPVX(must),
632 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
633 if (!s) {
634 if (last1 >= last2) {
635 DEBUG_r(PerlIO_printf(Perl_debug_log,
636 ", giving up...\n"));
637 goto fail_finish;
638 }
639 DEBUG_r(PerlIO_printf(Perl_debug_log,
640 ", trying floating at offset %ld...\n",
1aa99e6b
IH
641 (long)(HOP3c(s1, 1, strend) - i_strpos)));
642 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
643 s = HOP3c(last, 1, strend);
2c2d71f5
JH
644 goto restart;
645 }
646 else {
647 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 648 (long)(s - i_strpos)));
1aa99e6b
IH
649 t = HOP3c(s, -prog->anchored_offset, strbeg);
650 other_last = HOP3c(s, 1, strend);
30944b6d 651 s = s1;
2c2d71f5
JH
652 if (t == strpos)
653 goto try_at_start;
2c2d71f5
JH
654 goto try_at_offset;
655 }
30944b6d 656 }
2c2d71f5
JH
657 }
658 else { /* Take into account the floating substring. */
33b8afdf
JH
659 char *last, *last1;
660 char *s1 = s;
661 SV* must;
662
663 t = HOP3c(s, -start_shift, strbeg);
664 last1 = last =
665 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
666 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
667 last = HOP3c(t, prog->float_max_offset, strend);
668 s = HOP3c(t, prog->float_min_offset, strend);
669 if (s < other_last)
670 s = other_last;
2c2d71f5 671 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
672 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
673 /* fbm_instr() takes into account exact value of end-of-str
674 if the check is SvTAIL(ed). Since false positives are OK,
675 and end-of-str is not later than strend we are OK. */
676 if (must == &PL_sv_undef) {
677 s = (char*)NULL;
678 DEBUG_r(must = prog->float_utf8); /* for debug message */
679 }
680 else
2c2d71f5 681 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
682 (unsigned char*)last + SvCUR(must)
683 - (SvTAIL(must)!=0),
684 must, PL_multiline ? FBMrf_MULTILINE : 0);
685 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
686 (s ? "Found" : "Contradicts"),
687 PL_colors[0],
688 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
689 SvPVX(must),
690 PL_colors[1], (SvTAIL(must) ? "$" : "")));
691 if (!s) {
692 if (last1 == last) {
2c2d71f5 693 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
694 ", giving up...\n"));
695 goto fail_finish;
2c2d71f5 696 }
33b8afdf
JH
697 DEBUG_r(PerlIO_printf(Perl_debug_log,
698 ", trying anchored starting at offset %ld...\n",
699 (long)(s1 + 1 - i_strpos)));
700 other_last = last;
701 s = HOP3c(t, 1, strend);
702 goto restart;
703 }
704 else {
705 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
706 (long)(s - i_strpos)));
707 other_last = s; /* Fix this later. --Hugo */
708 s = s1;
709 if (t == strpos)
710 goto try_at_start;
711 goto try_at_offset;
712 }
2c2d71f5 713 }
cad2e5aa 714 }
2c2d71f5
JH
715
716 t = s - prog->check_offset_max;
2c2d71f5 717 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 718 && (!do_utf8
1aa99e6b
IH
719 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
720 && t > strpos))) {
2c2d71f5
JH
721 /* Fixed substring is found far enough so that the match
722 cannot start at strpos. */
723 try_at_offset:
cad2e5aa 724 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
725 /* Eventually fbm_*() should handle this, but often
726 anchored_offset is not 0, so this check will not be wasted. */
727 /* XXXX In the code below we prefer to look for "^" even in
728 presence of anchored substrings. And we search even
729 beyond the found float position. These pessimizations
730 are historical artefacts only. */
731 find_anchor:
2c2d71f5 732 while (t < strend - prog->minlen) {
cad2e5aa 733 if (*t == '\n') {
4ee3650e 734 if (t < check_at - prog->check_offset_min) {
33b8afdf 735 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
736 /* Since we moved from the found position,
737 we definitely contradict the found anchored
30944b6d
IZ
738 substr. Due to the above check we do not
739 contradict "check" substr.
740 Thus we can arrive here only if check substr
741 is float. Redo checking for "other"=="fixed".
742 */
9041c2e3 743 strpos = t + 1;
30944b6d
IZ
744 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
745 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
746 goto do_other_anchored;
747 }
4ee3650e
GS
748 /* We don't contradict the found floating substring. */
749 /* XXXX Why not check for STCLASS? */
cad2e5aa 750 s = t + 1;
2c2d71f5 751 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 752 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
753 goto set_useful;
754 }
4ee3650e
GS
755 /* Position contradicts check-string */
756 /* XXXX probably better to look for check-string
757 than for "\n", so one should lower the limit for t? */
758 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 759 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 760 other_last = strpos = s = t + 1;
cad2e5aa
JH
761 goto restart;
762 }
763 t++;
764 }
2c2d71f5
JH
765 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
766 PL_colors[0],PL_colors[1]));
767 goto fail_finish;
cad2e5aa 768 }
f5952150
GS
769 else {
770 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
771 PL_colors[0],PL_colors[1]));
772 }
cad2e5aa
JH
773 s = t;
774 set_useful:
33b8afdf 775 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
776 }
777 else {
f5952150 778 /* The found string does not prohibit matching at strpos,
2c2d71f5 779 - no optimization of calling REx engine can be performed,
f5952150
GS
780 unless it was an MBOL and we are not after MBOL,
781 or a future STCLASS check will fail this. */
2c2d71f5
JH
782 try_at_start:
783 /* Even in this situation we may use MBOL flag if strpos is offset
784 wrt the start of the string. */
05b4157f 785 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 786 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
787 /* May be due to an implicit anchor of m{.*foo} */
788 && !(prog->reganch & ROPT_IMPLICIT))
789 {
cad2e5aa
JH
790 t = strpos;
791 goto find_anchor;
792 }
30944b6d 793 DEBUG_r( if (ml_anch)
f5952150
GS
794 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
795 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 796 );
2c2d71f5 797 success_at_start:
30944b6d 798 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
799 && (do_utf8 ? (
800 prog->check_utf8 /* Could be deleted already */
801 && --BmUSEFUL(prog->check_utf8) < 0
802 && (prog->check_utf8 == prog->float_utf8)
803 ) : (
804 prog->check_substr /* Could be deleted already */
805 && --BmUSEFUL(prog->check_substr) < 0
806 && (prog->check_substr == prog->float_substr)
807 )))
66e933ab 808 {
cad2e5aa 809 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 810 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
811 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
812 if (do_utf8 ? prog->check_substr : prog->check_utf8)
813 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
814 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
815 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 816 check = Nullsv; /* abort */
cad2e5aa 817 s = strpos;
3cf5c195
IZ
818 /* XXXX This is a remnant of the old implementation. It
819 looks wasteful, since now INTUIT can use many
6eb5f6b9 820 other heuristics. */
cad2e5aa
JH
821 prog->reganch &= ~RE_USE_INTUIT;
822 }
823 else
824 s = strpos;
825 }
826
6eb5f6b9
JH
827 /* Last resort... */
828 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
829 if (prog->regstclass) {
830 /* minlen == 0 is possible if regstclass is \b or \B,
831 and the fixed substr is ''$.
832 Since minlen is already taken into account, s+1 is before strend;
833 accidentally, minlen >= 1 guaranties no false positives at s + 1
834 even for \b or \B. But (minlen? 1 : 0) below assumes that
835 regstclass does not come from lookahead... */
836 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
837 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 838 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 839 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 840 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 841 : 1);
33b8afdf 842 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 843 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 844 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
845 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
846 cl_l, strend)
847 : strend);
a1933d95 848 char *startpos = strbeg;
6eb5f6b9
JH
849
850 t = s;
76384e4a 851 if (prog->reganch & ROPT_UTF8) {
ffc61ed2 852 PL_regdata = prog->data;
76384e4a
GS
853 PL_bostr = startpos;
854 }
f33976b4 855 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
856 if (!s) {
857#ifdef DEBUGGING
b7953727 858 char *what = 0;
6eb5f6b9
JH
859#endif
860 if (endpos == strend) {
861 DEBUG_r( PerlIO_printf(Perl_debug_log,
862 "Could not match STCLASS...\n") );
863 goto fail;
864 }
66e933ab
GS
865 DEBUG_r( PerlIO_printf(Perl_debug_log,
866 "This position contradicts STCLASS...\n") );
653099ff
GS
867 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
868 goto fail;
6eb5f6b9 869 /* Contradict one of substrings */
33b8afdf
JH
870 if (prog->anchored_substr || prog->anchored_utf8) {
871 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
872 DEBUG_r( what = "anchored" );
873 hop_and_restart:
1aa99e6b 874 s = HOP3c(t, 1, strend);
66e933ab
GS
875 if (s + start_shift + end_shift > strend) {
876 /* XXXX Should be taken into account earlier? */
877 DEBUG_r( PerlIO_printf(Perl_debug_log,
878 "Could not match STCLASS...\n") );
879 goto fail;
880 }
5e39e1e5
HS
881 if (!check)
882 goto giveup;
6eb5f6b9 883 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 884 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
885 what, (long)(s + start_shift - i_strpos)) );
886 goto restart;
887 }
66e933ab 888 /* Have both, check_string is floating */
6eb5f6b9
JH
889 if (t + start_shift >= check_at) /* Contradicts floating=check */
890 goto retry_floating_check;
891 /* Recheck anchored substring, but not floating... */
9041c2e3 892 s = check_at;
5e39e1e5
HS
893 if (!check)
894 goto giveup;
6eb5f6b9 895 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 896 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
897 (long)(other_last - i_strpos)) );
898 goto do_other_anchored;
899 }
60e71179
GS
900 /* Another way we could have checked stclass at the
901 current position only: */
902 if (ml_anch) {
903 s = t = t + 1;
5e39e1e5
HS
904 if (!check)
905 goto giveup;
60e71179 906 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
907 "Looking for /%s^%s/m starting at offset %ld...\n",
908 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 909 goto try_at_offset;
66e933ab 910 }
33b8afdf 911 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 912 goto fail;
6eb5f6b9
JH
913 /* Check is floating subtring. */
914 retry_floating_check:
915 t = check_at - start_shift;
916 DEBUG_r( what = "floating" );
917 goto hop_and_restart;
918 }
b7953727
JH
919 if (t != s) {
920 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 921 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
922 (long)(t - i_strpos), (long)(s - i_strpos))
923 );
924 }
925 else {
926 DEBUG_r(PerlIO_printf(Perl_debug_log,
927 "Does not contradict STCLASS...\n");
928 );
929 }
6eb5f6b9 930 }
5e39e1e5
HS
931 giveup:
932 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
933 PL_colors[4], (check ? "Guessed" : "Giving up"),
934 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 935 return s;
2c2d71f5
JH
936
937 fail_finish: /* Substring not found */
33b8afdf
JH
938 if (prog->check_substr || prog->check_utf8) /* could be removed already */
939 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 940 fail:
2c2d71f5 941 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
942 PL_colors[4],PL_colors[5]));
943 return Nullch;
944}
9661b544 945
6eb5f6b9 946/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
947STATIC char *
948S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 949{
6eb5f6b9
JH
950 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
951 char *m;
d8093b23
G
952 STRLEN ln;
953 unsigned int c1;
954 unsigned int c2;
6eb5f6b9
JH
955 char *e;
956 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 957 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 958
6eb5f6b9
JH
959 /* We know what class it must start with. */
960 switch (OP(c)) {
6eb5f6b9 961 case ANYOF:
a0ed51b3 962 while (s < strend) {
7d3e948e
JH
963 STRLEN skip = do_utf8 ? UTF8SKIP(s) : 1;
964
965 if (do_utf8 ?
ba7b4546 966 reginclass(c, (U8*)s, 0, do_utf8) :
7d3e948e 967 REGINCLASS(c, (U8*)s) ||
ebc501f0
JH
968 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
969 /* The assignment of 2 is intentional:
970 * for the sharp s, the skip is 2. */
971 (skip = SHARP_S_SKIP)
972 )) {
6eb5f6b9
JH
973 if (tmp && (norun || regtry(prog, s)))
974 goto got_it;
975 else
976 tmp = doevery;
a0ed51b3 977 }
e0f9d4a8 978 else
7d3e948e 979 tmp = 1;
e0f9d4a8 980 s += skip;
a0d0e21e 981 }
6eb5f6b9 982 break;
f33976b4
DB
983 case CANY:
984 while (s < strend) {
985 if (tmp && (norun || regtry(prog, s)))
986 goto got_it;
987 else
988 tmp = doevery;
989 s++;
990 }
991 break;
6eb5f6b9
JH
992 case EXACTF:
993 m = STRING(c);
994 ln = STR_LEN(c);
1aa99e6b 995 if (UTF) {
a2a2844f 996 STRLEN ulen1, ulen2;
e7ae6809
JH
997 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
998 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
999
1000 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1001 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1002
872c91ae
JH
1003 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1004 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1005 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1006 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1007 }
1008 else {
1009 c1 = *(U8*)m;
1010 c2 = PL_fold[c1];
1011 }
6eb5f6b9
JH
1012 goto do_exactf;
1013 case EXACTFL:
1014 m = STRING(c);
1015 ln = STR_LEN(c);
d8093b23 1016 c1 = *(U8*)m;
6eb5f6b9
JH
1017 c2 = PL_fold_locale[c1];
1018 do_exactf:
6795a346 1019 e = HOP3c(strend, -(I32)ln, s);
b3c9acc1 1020
6eb5f6b9
JH
1021 if (norun && e < s)
1022 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1023
60a8b682
JH
1024 /* The idea in the EXACTF* cases is to first find the
1025 * first character of the EXACTF* node and then, if
1026 * necessary, case-insensitively compare the full
1027 * text of the node. The c1 and c2 are the first
1028 * characters (though in Unicode it gets a bit
1029 * more complicated because there are more cases
7f16dd3d
JH
1030 * than just upper and lower: one needs to use
1031 * the so-called folding case for case-insensitive
1032 * matching (called "loose matching" in Unicode).
1033 * ibcmp_utf8() will do just that. */
60a8b682 1034
1aa99e6b 1035 if (do_utf8) {
575cac57
JH
1036 UV c, f;
1037 U8 tmpbuf [UTF8_MAXLEN+1];
1038 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1039 STRLEN len, foldlen;
d7f013c8 1040
09091399 1041 if (c1 == c2) {
1aa99e6b 1042 while (s <= e) {
872c91ae
JH
1043 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1044 ckWARN(WARN_UTF8) ?
1045 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1046 if ( c == c1
1047 && (ln == len ||
66423254 1048 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1049 m, (char **)0, ln, (bool)UTF))
55da9344 1050 && (norun || regtry(prog, s)) )
1aa99e6b 1051 goto got_it;
80aecb99
JH
1052 else {
1053 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1054 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1055 if ( f != c
1056 && (f == c1 || f == c2)
1057 && (ln == foldlen ||
66423254
JH
1058 !ibcmp_utf8((char *) foldbuf,
1059 (char **)0, foldlen, do_utf8,
d07ddd77 1060 m,
eb160463 1061 (char **)0, ln, (bool)UTF))
80aecb99
JH
1062 && (norun || regtry(prog, s)) )
1063 goto got_it;
1064 }
1aa99e6b
IH
1065 s += len;
1066 }
09091399
JH
1067 }
1068 else {
1aa99e6b 1069 while (s <= e) {
872c91ae
JH
1070 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1071 ckWARN(WARN_UTF8) ?
1072 0 : UTF8_ALLOW_ANY);
80aecb99 1073
60a8b682 1074 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1075 * Note that not all the possible combinations
1076 * are handled here: some of them are handled
1077 * by the standard folding rules, and some of
1078 * them (the character class or ANYOF cases)
1079 * are handled during compiletime in
1080 * regexec.c:S_regclass(). */
880bd946
JH
1081 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1082 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1083 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1084
1085 if ( (c == c1 || c == c2)
1086 && (ln == len ||
66423254 1087 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1088 m, (char **)0, ln, (bool)UTF))
55da9344 1089 && (norun || regtry(prog, s)) )
1aa99e6b 1090 goto got_it;
80aecb99
JH
1091 else {
1092 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1093 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1094 if ( f != c
1095 && (f == c1 || f == c2)
1096 && (ln == foldlen ||
a6872d42 1097 !ibcmp_utf8((char *) foldbuf,
66423254 1098 (char **)0, foldlen, do_utf8,
d07ddd77 1099 m,
eb160463 1100 (char **)0, ln, (bool)UTF))
80aecb99
JH
1101 && (norun || regtry(prog, s)) )
1102 goto got_it;
1103 }
1aa99e6b
IH
1104 s += len;
1105 }
09091399 1106 }
1aa99e6b
IH
1107 }
1108 else {
1109 if (c1 == c2)
1110 while (s <= e) {
1111 if ( *(U8*)s == c1
1112 && (ln == 1 || !(OP(c) == EXACTF
1113 ? ibcmp(s, m, ln)
1114 : ibcmp_locale(s, m, ln)))
1115 && (norun || regtry(prog, s)) )
1116 goto got_it;
1117 s++;
1118 }
1119 else
1120 while (s <= e) {
1121 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1122 && (ln == 1 || !(OP(c) == EXACTF
1123 ? ibcmp(s, m, ln)
1124 : ibcmp_locale(s, m, ln)))
1125 && (norun || regtry(prog, s)) )
1126 goto got_it;
1127 s++;
1128 }
b3c9acc1
IZ
1129 }
1130 break;
bbce6d69 1131 case BOUNDL:
3280af22 1132 PL_reg_flags |= RF_tainted;
bbce6d69 1133 /* FALL THROUGH */
a0d0e21e 1134 case BOUND:
ffc61ed2 1135 if (do_utf8) {
12d33761 1136 if (s == PL_bostr)
ffc61ed2
JH
1137 tmp = '\n';
1138 else {
b4f7163a 1139 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1140
b4f7163a 1141 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1142 }
1143 tmp = ((OP(c) == BOUND ?
9041c2e3 1144 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1145 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1146 while (s < strend) {
1147 if (tmp == !(OP(c) == BOUND ?
3568d838 1148 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1149 isALNUM_LC_utf8((U8*)s)))
1150 {
1151 tmp = !tmp;
1152 if ((norun || regtry(prog, s)))
1153 goto got_it;
1154 }
1155 s += UTF8SKIP(s);
a687059c 1156 }
a0d0e21e 1157 }
667bb95a 1158 else {
12d33761 1159 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1160 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1161 while (s < strend) {
1162 if (tmp ==
1163 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1164 tmp = !tmp;
1165 if ((norun || regtry(prog, s)))
1166 goto got_it;
1167 }
1168 s++;
a0ed51b3 1169 }
a0ed51b3 1170 }
6eb5f6b9 1171 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1172 goto got_it;
1173 break;
bbce6d69 1174 case NBOUNDL:
3280af22 1175 PL_reg_flags |= RF_tainted;
bbce6d69 1176 /* FALL THROUGH */
a0d0e21e 1177 case NBOUND:
ffc61ed2 1178 if (do_utf8) {
12d33761 1179 if (s == PL_bostr)
ffc61ed2
JH
1180 tmp = '\n';
1181 else {
b4f7163a 1182 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1183
b4f7163a 1184 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1185 }
1186 tmp = ((OP(c) == NBOUND ?
9041c2e3 1187 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1188 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1189 while (s < strend) {
1190 if (tmp == !(OP(c) == NBOUND ?
3568d838 1191 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1192 isALNUM_LC_utf8((U8*)s)))
1193 tmp = !tmp;
1194 else if ((norun || regtry(prog, s)))
1195 goto got_it;
1196 s += UTF8SKIP(s);
1197 }
a0d0e21e 1198 }
667bb95a 1199 else {
12d33761 1200 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1201 tmp = ((OP(c) == NBOUND ?
1202 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1203 while (s < strend) {
1204 if (tmp ==
1205 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1206 tmp = !tmp;
1207 else if ((norun || regtry(prog, s)))
1208 goto got_it;
1209 s++;
1210 }
a0ed51b3 1211 }
6eb5f6b9 1212 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1213 goto got_it;
1214 break;
a0d0e21e 1215 case ALNUM:
ffc61ed2 1216 if (do_utf8) {
8269fa76 1217 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1218 while (s < strend) {
3568d838 1219 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1220 if (tmp && (norun || regtry(prog, s)))
1221 goto got_it;
1222 else
1223 tmp = doevery;
1224 }
bbce6d69 1225 else
ffc61ed2
JH
1226 tmp = 1;
1227 s += UTF8SKIP(s);
bbce6d69 1228 }
bbce6d69 1229 }
ffc61ed2
JH
1230 else {
1231 while (s < strend) {
1232 if (isALNUM(*s)) {
1233 if (tmp && (norun || regtry(prog, s)))
1234 goto got_it;
1235 else
1236 tmp = doevery;
1237 }
a0ed51b3 1238 else
ffc61ed2
JH
1239 tmp = 1;
1240 s++;
a0ed51b3 1241 }
a0ed51b3
LW
1242 }
1243 break;
bbce6d69 1244 case ALNUML:
3280af22 1245 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1246 if (do_utf8) {
1247 while (s < strend) {
1248 if (isALNUM_LC_utf8((U8*)s)) {
1249 if (tmp && (norun || regtry(prog, s)))
1250 goto got_it;
1251 else
1252 tmp = doevery;
1253 }
a687059c 1254 else
ffc61ed2
JH
1255 tmp = 1;
1256 s += UTF8SKIP(s);
a0d0e21e 1257 }
a0d0e21e 1258 }
ffc61ed2
JH
1259 else {
1260 while (s < strend) {
1261 if (isALNUM_LC(*s)) {
1262 if (tmp && (norun || regtry(prog, s)))
1263 goto got_it;
1264 else
1265 tmp = doevery;
1266 }
a0ed51b3 1267 else
ffc61ed2
JH
1268 tmp = 1;
1269 s++;
a0ed51b3 1270 }
a0ed51b3
LW
1271 }
1272 break;
a0d0e21e 1273 case NALNUM:
ffc61ed2 1274 if (do_utf8) {
8269fa76 1275 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1276 while (s < strend) {
3568d838 1277 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1278 if (tmp && (norun || regtry(prog, s)))
1279 goto got_it;
1280 else
1281 tmp = doevery;
1282 }
bbce6d69 1283 else
ffc61ed2
JH
1284 tmp = 1;
1285 s += UTF8SKIP(s);
bbce6d69 1286 }
bbce6d69 1287 }
ffc61ed2
JH
1288 else {
1289 while (s < strend) {
1290 if (!isALNUM(*s)) {
1291 if (tmp && (norun || regtry(prog, s)))
1292 goto got_it;
1293 else
1294 tmp = doevery;
1295 }
a0ed51b3 1296 else
ffc61ed2
JH
1297 tmp = 1;
1298 s++;
a0ed51b3 1299 }
a0ed51b3
LW
1300 }
1301 break;
bbce6d69 1302 case NALNUML:
3280af22 1303 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1304 if (do_utf8) {
1305 while (s < strend) {
1306 if (!isALNUM_LC_utf8((U8*)s)) {
1307 if (tmp && (norun || regtry(prog, s)))
1308 goto got_it;
1309 else
1310 tmp = doevery;
1311 }
a687059c 1312 else
ffc61ed2
JH
1313 tmp = 1;
1314 s += UTF8SKIP(s);
a687059c 1315 }
a0d0e21e 1316 }
ffc61ed2
JH
1317 else {
1318 while (s < strend) {
1319 if (!isALNUM_LC(*s)) {
1320 if (tmp && (norun || regtry(prog, s)))
1321 goto got_it;
1322 else
1323 tmp = doevery;
1324 }
a0ed51b3 1325 else
ffc61ed2
JH
1326 tmp = 1;
1327 s++;
a0ed51b3 1328 }
a0ed51b3
LW
1329 }
1330 break;
a0d0e21e 1331 case SPACE:
ffc61ed2 1332 if (do_utf8) {
8269fa76 1333 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1334 while (s < strend) {
3568d838 1335 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1336 if (tmp && (norun || regtry(prog, s)))
1337 goto got_it;
1338 else
1339 tmp = doevery;
1340 }
a0d0e21e 1341 else
ffc61ed2
JH
1342 tmp = 1;
1343 s += UTF8SKIP(s);
2304df62 1344 }
a0d0e21e 1345 }
ffc61ed2
JH
1346 else {
1347 while (s < strend) {
1348 if (isSPACE(*s)) {
1349 if (tmp && (norun || regtry(prog, s)))
1350 goto got_it;
1351 else
1352 tmp = doevery;
1353 }
a0ed51b3 1354 else
ffc61ed2
JH
1355 tmp = 1;
1356 s++;
a0ed51b3 1357 }
a0ed51b3
LW
1358 }
1359 break;
bbce6d69 1360 case SPACEL:
3280af22 1361 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1362 if (do_utf8) {
1363 while (s < strend) {
1364 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1365 if (tmp && (norun || regtry(prog, s)))
1366 goto got_it;
1367 else
1368 tmp = doevery;
1369 }
bbce6d69 1370 else
ffc61ed2
JH
1371 tmp = 1;
1372 s += UTF8SKIP(s);
bbce6d69 1373 }
bbce6d69 1374 }
ffc61ed2
JH
1375 else {
1376 while (s < strend) {
1377 if (isSPACE_LC(*s)) {
1378 if (tmp && (norun || regtry(prog, s)))
1379 goto got_it;
1380 else
1381 tmp = doevery;
1382 }
a0ed51b3 1383 else
ffc61ed2
JH
1384 tmp = 1;
1385 s++;
a0ed51b3 1386 }
a0ed51b3
LW
1387 }
1388 break;
a0d0e21e 1389 case NSPACE:
ffc61ed2 1390 if (do_utf8) {
8269fa76 1391 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1392 while (s < strend) {
3568d838 1393 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1394 if (tmp && (norun || regtry(prog, s)))
1395 goto got_it;
1396 else
1397 tmp = doevery;
1398 }
a0d0e21e 1399 else
ffc61ed2
JH
1400 tmp = 1;
1401 s += UTF8SKIP(s);
a687059c 1402 }
a0d0e21e 1403 }
ffc61ed2
JH
1404 else {
1405 while (s < strend) {
1406 if (!isSPACE(*s)) {
1407 if (tmp && (norun || regtry(prog, s)))
1408 goto got_it;
1409 else
1410 tmp = doevery;
1411 }
a0ed51b3 1412 else
ffc61ed2
JH
1413 tmp = 1;
1414 s++;
a0ed51b3 1415 }
a0ed51b3
LW
1416 }
1417 break;
bbce6d69 1418 case NSPACEL:
3280af22 1419 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1420 if (do_utf8) {
1421 while (s < strend) {
1422 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1423 if (tmp && (norun || regtry(prog, s)))
1424 goto got_it;
1425 else
1426 tmp = doevery;
1427 }
bbce6d69 1428 else
ffc61ed2
JH
1429 tmp = 1;
1430 s += UTF8SKIP(s);
bbce6d69 1431 }
bbce6d69 1432 }
ffc61ed2
JH
1433 else {
1434 while (s < strend) {
1435 if (!isSPACE_LC(*s)) {
1436 if (tmp && (norun || regtry(prog, s)))
1437 goto got_it;
1438 else
1439 tmp = doevery;
1440 }
a0ed51b3 1441 else
ffc61ed2
JH
1442 tmp = 1;
1443 s++;
a0ed51b3 1444 }
a0ed51b3
LW
1445 }
1446 break;
a0d0e21e 1447 case DIGIT:
ffc61ed2 1448 if (do_utf8) {
8269fa76 1449 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1450 while (s < strend) {
3568d838 1451 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1452 if (tmp && (norun || regtry(prog, s)))
1453 goto got_it;
1454 else
1455 tmp = doevery;
1456 }
a0d0e21e 1457 else
ffc61ed2
JH
1458 tmp = 1;
1459 s += UTF8SKIP(s);
2b69d0c2 1460 }
a0d0e21e 1461 }
ffc61ed2
JH
1462 else {
1463 while (s < strend) {
1464 if (isDIGIT(*s)) {
1465 if (tmp && (norun || regtry(prog, s)))
1466 goto got_it;
1467 else
1468 tmp = doevery;
1469 }
a0ed51b3 1470 else
ffc61ed2
JH
1471 tmp = 1;
1472 s++;
a0ed51b3 1473 }
a0ed51b3
LW
1474 }
1475 break;
b8c5462f
JH
1476 case DIGITL:
1477 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1478 if (do_utf8) {
1479 while (s < strend) {
1480 if (isDIGIT_LC_utf8((U8*)s)) {
1481 if (tmp && (norun || regtry(prog, s)))
1482 goto got_it;
1483 else
1484 tmp = doevery;
1485 }
b8c5462f 1486 else
ffc61ed2
JH
1487 tmp = 1;
1488 s += UTF8SKIP(s);
b8c5462f 1489 }
b8c5462f 1490 }
ffc61ed2
JH
1491 else {
1492 while (s < strend) {
1493 if (isDIGIT_LC(*s)) {
1494 if (tmp && (norun || regtry(prog, s)))
1495 goto got_it;
1496 else
1497 tmp = doevery;
1498 }
b8c5462f 1499 else
ffc61ed2
JH
1500 tmp = 1;
1501 s++;
b8c5462f 1502 }
b8c5462f
JH
1503 }
1504 break;
a0d0e21e 1505 case NDIGIT:
ffc61ed2 1506 if (do_utf8) {
8269fa76 1507 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1508 while (s < strend) {
3568d838 1509 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1510 if (tmp && (norun || regtry(prog, s)))
1511 goto got_it;
1512 else
1513 tmp = doevery;
1514 }
a0d0e21e 1515 else
ffc61ed2
JH
1516 tmp = 1;
1517 s += UTF8SKIP(s);
a687059c 1518 }
a0d0e21e 1519 }
ffc61ed2
JH
1520 else {
1521 while (s < strend) {
1522 if (!isDIGIT(*s)) {
1523 if (tmp && (norun || regtry(prog, s)))
1524 goto got_it;
1525 else
1526 tmp = doevery;
1527 }
a0ed51b3 1528 else
ffc61ed2
JH
1529 tmp = 1;
1530 s++;
a0ed51b3 1531 }
a0ed51b3
LW
1532 }
1533 break;
b8c5462f
JH
1534 case NDIGITL:
1535 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1536 if (do_utf8) {
1537 while (s < strend) {
1538 if (!isDIGIT_LC_utf8((U8*)s)) {
1539 if (tmp && (norun || regtry(prog, s)))
1540 goto got_it;
1541 else
1542 tmp = doevery;
1543 }
b8c5462f 1544 else
ffc61ed2
JH
1545 tmp = 1;
1546 s += UTF8SKIP(s);
b8c5462f 1547 }
a0ed51b3 1548 }
ffc61ed2
JH
1549 else {
1550 while (s < strend) {
1551 if (!isDIGIT_LC(*s)) {
1552 if (tmp && (norun || regtry(prog, s)))
1553 goto got_it;
1554 else
1555 tmp = doevery;
1556 }
cf93c79d 1557 else
ffc61ed2
JH
1558 tmp = 1;
1559 s++;
b8c5462f 1560 }
b8c5462f
JH
1561 }
1562 break;
b3c9acc1 1563 default:
3c3eec57
GS
1564 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1565 break;
d6a28714 1566 }
6eb5f6b9
JH
1567 return 0;
1568 got_it:
1569 return s;
1570}
1571
1572/*
1573 - regexec_flags - match a regexp against a string
1574 */
1575I32
1576Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1577 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1578/* strend: pointer to null at end of string */
1579/* strbeg: real beginning of string */
1580/* minend: end of match must be >=minend after stringarg. */
1581/* data: May be used for some additional optimizations. */
1582/* nosave: For optimizations. */
1583{
6eb5f6b9
JH
1584 register char *s;
1585 register regnode *c;
1586 register char *startpos = stringarg;
6eb5f6b9
JH
1587 I32 minlen; /* must match at least this many chars */
1588 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1589 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1590 constant substr. */ /* CC */
1591 I32 end_shift = 0; /* Same for the end. */ /* CC */
1592 I32 scream_pos = -1; /* Internal iterator of scream. */
1593 char *scream_olds;
1594 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1595 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1596#ifdef DEBUGGING
9e55ce06
JH
1597 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1598 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1599#endif
6eb5f6b9
JH
1600
1601 PL_regcc = 0;
1602
1603 cache_re(prog);
1604#ifdef DEBUGGING
aea4f609 1605 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1606#endif
1607
1608 /* Be paranoid... */
1609 if (prog == NULL || startpos == NULL) {
1610 Perl_croak(aTHX_ "NULL regexp parameter");
1611 return 0;
1612 }
1613
1614 minlen = prog->minlen;
61a36c01 1615 if (strend - startpos < minlen) {
a72c7584
JH
1616 DEBUG_r(PerlIO_printf(Perl_debug_log,
1617 "String too short [regexec_flags]...\n"));
1618 goto phooey;
1aa99e6b 1619 }
6eb5f6b9 1620
6eb5f6b9
JH
1621 /* Check validity of program. */
1622 if (UCHARAT(prog->program) != REG_MAGIC) {
1623 Perl_croak(aTHX_ "corrupted regexp program");
1624 }
1625
1626 PL_reg_flags = 0;
1627 PL_reg_eval_set = 0;
1628 PL_reg_maxiter = 0;
1629
1630 if (prog->reganch & ROPT_UTF8)
1631 PL_reg_flags |= RF_utf8;
1632
1633 /* Mark beginning of line for ^ and lookbehind. */
1634 PL_regbol = startpos;
1635 PL_bostr = strbeg;
1636 PL_reg_sv = sv;
1637
1638 /* Mark end of line for $ (and such) */
1639 PL_regeol = strend;
1640
1641 /* see how far we have to get to not match where we matched before */
1642 PL_regtill = startpos+minend;
1643
1644 /* We start without call_cc context. */
1645 PL_reg_call_cc = 0;
1646
1647 /* If there is a "must appear" string, look for it. */
1648 s = startpos;
1649
1650 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1651 MAGIC *mg;
1652
1653 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1654 PL_reg_ganch = startpos;
1655 else if (sv && SvTYPE(sv) >= SVt_PVMG
1656 && SvMAGIC(sv)
14befaf4
DM
1657 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1658 && mg->mg_len >= 0) {
6eb5f6b9
JH
1659 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1660 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1661 if (s > PL_reg_ganch)
6eb5f6b9
JH
1662 goto phooey;
1663 s = PL_reg_ganch;
1664 }
1665 }
1666 else /* pos() not defined */
1667 PL_reg_ganch = strbeg;
1668 }
1669
33b8afdf 1670 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1671 re_scream_pos_data d;
1672
1673 d.scream_olds = &scream_olds;
1674 d.scream_pos = &scream_pos;
1675 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1676 if (!s) {
1677 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1678 goto phooey; /* not present */
3fa9c3d7 1679 }
6eb5f6b9
JH
1680 }
1681
2a782b5b 1682 DEBUG_r({
9e55ce06
JH
1683 char *s0 = UTF ?
1684 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1685 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1686 prog->precomp;
1687 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1688 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1689 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1690 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1691 if (!PL_colorset)
1692 reginitcolors();
1693 PerlIO_printf(Perl_debug_log,
9e55ce06 1694 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1695 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1696 len0, len0, s0,
2a782b5b 1697 PL_colors[1],
9e55ce06 1698 len0 > 60 ? "..." : "",
2a782b5b 1699 PL_colors[0],
9e55ce06
JH
1700 (int)(len1 > 60 ? 60 : len1),
1701 s1, PL_colors[1],
1702 (len1 > 60 ? "..." : "")
2a782b5b
JH
1703 );
1704 });
6eb5f6b9
JH
1705
1706 /* Simplest case: anchored match need be tried only once. */
1707 /* [unless only anchor is BOL and multiline is set] */
1708 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1709 if (s == startpos && regtry(prog, startpos))
1710 goto got_it;
1711 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1712 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1713 {
1714 char *end;
1715
1716 if (minlen)
1717 dontbother = minlen - 1;
1aa99e6b 1718 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1719 /* for multiline we only have to try after newlines */
33b8afdf 1720 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1721 if (s == startpos)
1722 goto after_try;
1723 while (1) {
1724 if (regtry(prog, s))
1725 goto got_it;
1726 after_try:
1727 if (s >= end)
1728 goto phooey;
1729 if (prog->reganch & RE_USE_INTUIT) {
1730 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1731 if (!s)
1732 goto phooey;
1733 }
1734 else
1735 s++;
1736 }
1737 } else {
1738 if (s > startpos)
1739 s--;
1740 while (s < end) {
1741 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1742 if (regtry(prog, s))
1743 goto got_it;
1744 }
1745 }
1746 }
1747 }
1748 goto phooey;
1749 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1750 if (regtry(prog, PL_reg_ganch))
1751 goto got_it;
1752 goto phooey;
1753 }
1754
1755 /* Messy cases: unanchored match. */
33b8afdf 1756 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1757 /* we have /x+whatever/ */
1758 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1759 char ch;
bf93d4cc
GS
1760#ifdef DEBUGGING
1761 int did_match = 0;
1762#endif
33b8afdf
JH
1763 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1764 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1765 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1766
1aa99e6b 1767 if (do_utf8) {
6eb5f6b9
JH
1768 while (s < strend) {
1769 if (*s == ch) {
bf93d4cc 1770 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1771 if (regtry(prog, s)) goto got_it;
1772 s += UTF8SKIP(s);
1773 while (s < strend && *s == ch)
1774 s += UTF8SKIP(s);
1775 }
1776 s += UTF8SKIP(s);
1777 }
1778 }
1779 else {
1780 while (s < strend) {
1781 if (*s == ch) {
bf93d4cc 1782 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1783 if (regtry(prog, s)) goto got_it;
1784 s++;
1785 while (s < strend && *s == ch)
1786 s++;
1787 }
1788 s++;
1789 }
1790 }
b7953727 1791 DEBUG_r(if (!did_match)
bf93d4cc 1792 PerlIO_printf(Perl_debug_log,
b7953727
JH
1793 "Did not find anchored character...\n")
1794 );
6eb5f6b9
JH
1795 }
1796 /*SUPPRESS 560*/
33b8afdf
JH
1797 else if (prog->anchored_substr != Nullsv
1798 || prog->anchored_utf8 != Nullsv
1799 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1800 && prog->float_max_offset < strend - s)) {
1801 SV *must;
1802 I32 back_max;
1803 I32 back_min;
1804 char *last;
6eb5f6b9 1805 char *last1; /* Last position checked before */
bf93d4cc
GS
1806#ifdef DEBUGGING
1807 int did_match = 0;
1808#endif
33b8afdf
JH
1809 if (prog->anchored_substr || prog->anchored_utf8) {
1810 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1811 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1812 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1813 back_max = back_min = prog->anchored_offset;
1814 } else {
1815 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1816 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1817 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1818 back_max = prog->float_max_offset;
1819 back_min = prog->float_min_offset;
1820 }
1821 if (must == &PL_sv_undef)
1822 /* could not downgrade utf8 check substring, so must fail */
1823 goto phooey;
1824
1825 last = HOP3c(strend, /* Cannot start after this */
1826 -(I32)(CHR_SVLEN(must)
1827 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1828
1829 if (s > PL_bostr)
1830 last1 = HOPc(s, -1);
1831 else
1832 last1 = s - 1; /* bogus */
1833
1834 /* XXXX check_substr already used to find `s', can optimize if
1835 check_substr==must. */
1836 scream_pos = -1;
1837 dontbother = end_shift;
1838 strend = HOPc(strend, -dontbother);
1839 while ( (s <= last) &&
9041c2e3 1840 ((flags & REXEC_SCREAM)
1aa99e6b 1841 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1842 end_shift, &scream_pos, 0))
1aa99e6b 1843 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1844 (unsigned char*)strend, must,
6eb5f6b9 1845 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
bf93d4cc 1846 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1847 if (HOPc(s, -back_max) > last1) {
1848 last1 = HOPc(s, -back_min);
1849 s = HOPc(s, -back_max);
1850 }
1851 else {
1852 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1853
1854 last1 = HOPc(s, -back_min);
1855 s = t;
1856 }
1aa99e6b 1857 if (do_utf8) {
6eb5f6b9
JH
1858 while (s <= last1) {
1859 if (regtry(prog, s))
1860 goto got_it;
1861 s += UTF8SKIP(s);
1862 }
1863 }
1864 else {
1865 while (s <= last1) {
1866 if (regtry(prog, s))
1867 goto got_it;
1868 s++;
1869 }
1870 }
1871 }
b7953727
JH
1872 DEBUG_r(if (!did_match)
1873 PerlIO_printf(Perl_debug_log,
1874 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1875 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1876 ? "anchored" : "floating"),
1877 PL_colors[0],
1878 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1879 SvPVX(must),
b7953727
JH
1880 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1881 );
6eb5f6b9
JH
1882 goto phooey;
1883 }
155aba94 1884 else if ((c = prog->regstclass)) {
66e933ab
GS
1885 if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
1886 /* don't bother with what can't match */
6eb5f6b9 1887 strend = HOPc(strend, -(minlen - 1));
ffc61ed2
JH
1888 DEBUG_r({
1889 SV *prop = sv_newmortal();
9e55ce06
JH
1890 char *s0;
1891 char *s1;
1892 int len0;
1893 int len1;
1894
ffc61ed2 1895 regprop(prop, c);
9e55ce06
JH
1896 s0 = UTF ?
1897 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1898 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1899 SvPVX(prop);
1900 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1901 s1 = UTF ?
c728cb41 1902 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1903 len1 = UTF ? SvCUR(dsv1) : strend - s;
1904 PerlIO_printf(Perl_debug_log,
1905 "Matching stclass `%*.*s' against `%*.*s'\n",
1906 len0, len0, s0,
1907 len1, len1, s1);
ffc61ed2 1908 });
6eb5f6b9
JH
1909 if (find_byclass(prog, c, s, strend, startpos, 0))
1910 goto got_it;
bf93d4cc 1911 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1912 }
1913 else {
1914 dontbother = 0;
33b8afdf
JH
1915 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1916 /* Trim the end. */
d6a28714 1917 char *last;
33b8afdf
JH
1918 SV* float_real;
1919
1920 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1921 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1922 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1923
1924 if (flags & REXEC_SCREAM) {
33b8afdf 1925 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1926 end_shift, &scream_pos, 1); /* last one */
1927 if (!last)
ffc61ed2 1928 last = scream_olds; /* Only one occurrence. */
b8c5462f 1929 }
d6a28714
JH
1930 else {
1931 STRLEN len;
33b8afdf 1932 char *little = SvPV(float_real, len);
d6a28714 1933
33b8afdf 1934 if (SvTAIL(float_real)) {
d6a28714
JH
1935 if (memEQ(strend - len + 1, little, len - 1))
1936 last = strend - len + 1;
1937 else if (!PL_multiline)
9041c2e3 1938 last = memEQ(strend - len, little, len)
d6a28714 1939 ? strend - len : Nullch;
b8c5462f 1940 else
d6a28714
JH
1941 goto find_last;
1942 } else {
1943 find_last:
9041c2e3 1944 if (len)
d6a28714 1945 last = rninstr(s, strend, little, little + len);
b8c5462f 1946 else
d6a28714 1947 last = strend; /* matching `$' */
b8c5462f 1948 }
b8c5462f 1949 }
bf93d4cc
GS
1950 if (last == NULL) {
1951 DEBUG_r(PerlIO_printf(Perl_debug_log,
1952 "%sCan't trim the tail, match fails (should not happen)%s\n",
1953 PL_colors[4],PL_colors[5]));
1954 goto phooey; /* Should not happen! */
1955 }
d6a28714
JH
1956 dontbother = strend - last + prog->float_min_offset;
1957 }
1958 if (minlen && (dontbother < minlen))
1959 dontbother = minlen - 1;
1960 strend -= dontbother; /* this one's always in bytes! */
1961 /* We don't know much -- general case. */
1aa99e6b 1962 if (do_utf8) {
d6a28714
JH
1963 for (;;) {
1964 if (regtry(prog, s))
1965 goto got_it;
1966 if (s >= strend)
1967 break;
b8c5462f 1968 s += UTF8SKIP(s);
d6a28714
JH
1969 };
1970 }
1971 else {
1972 do {
1973 if (regtry(prog, s))
1974 goto got_it;
1975 } while (s++ < strend);
1976 }
1977 }
1978
1979 /* Failure. */
1980 goto phooey;
1981
1982got_it:
1983 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1984
1985 if (PL_reg_eval_set) {
1986 /* Preserve the current value of $^R */
1987 if (oreplsv != GvSV(PL_replgv))
1988 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1989 restored, the value remains
1990 the same. */
acfe0abc 1991 restore_pos(aTHX_ 0);
d6a28714
JH
1992 }
1993
1994 /* make sure $`, $&, $', and $digit will work later */
1995 if ( !(flags & REXEC_NOT_FIRST) ) {
1996 if (RX_MATCH_COPIED(prog)) {
1997 Safefree(prog->subbeg);
1998 RX_MATCH_COPIED_off(prog);
1999 }
2000 if (flags & REXEC_COPY_STR) {
2001 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2002
2003 s = savepvn(strbeg, i);
2004 prog->subbeg = s;
2005 prog->sublen = i;
2006 RX_MATCH_COPIED_on(prog);
2007 }
2008 else {
2009 prog->subbeg = strbeg;
2010 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2011 }
2012 }
9041c2e3 2013
d6a28714
JH
2014 return 1;
2015
2016phooey:
bf93d4cc
GS
2017 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2018 PL_colors[4],PL_colors[5]));
d6a28714 2019 if (PL_reg_eval_set)
acfe0abc 2020 restore_pos(aTHX_ 0);
d6a28714
JH
2021 return 0;
2022}
2023
2024/*
2025 - regtry - try match at specific point
2026 */
2027STATIC I32 /* 0 failure, 1 success */
2028S_regtry(pTHX_ regexp *prog, char *startpos)
2029{
d6a28714
JH
2030 register I32 i;
2031 register I32 *sp;
2032 register I32 *ep;
2033 CHECKPOINT lastcp;
2034
02db2b7b
IZ
2035#ifdef DEBUGGING
2036 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2037#endif
d6a28714
JH
2038 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2039 MAGIC *mg;
2040
2041 PL_reg_eval_set = RS_init;
2042 DEBUG_r(DEBUG_s(
b900a521
JH
2043 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2044 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2045 ));
e8347627 2046 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2047 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2048 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2049 SAVETMPS;
2050 /* Apparently this is not needed, judging by wantarray. */
e8347627 2051 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2052 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2053
2054 if (PL_reg_sv) {
2055 /* Make $_ available to executed code. */
2056 if (PL_reg_sv != DEFSV) {
4d1ff10f 2057 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2058 SAVESPTR(DEFSV);
2059 DEFSV = PL_reg_sv;
b8c5462f 2060 }
d6a28714 2061
9041c2e3 2062 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2063 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2064 /* prepare for quick setting of pos */
14befaf4
DM
2065 sv_magic(PL_reg_sv, (SV*)0,
2066 PERL_MAGIC_regex_global, Nullch, 0);
2067 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2068 mg->mg_len = -1;
b8c5462f 2069 }
d6a28714
JH
2070 PL_reg_magic = mg;
2071 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2072 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2073 }
09687e5a 2074 if (!PL_reg_curpm) {
0f79a09d 2075 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2076#ifdef USE_ITHREADS
2077 {
2078 SV* repointer = newSViv(0);
577e12cc 2079 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2080 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2081 av_push(PL_regex_padav,repointer);
2082 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2083 PL_regex_pad = AvARRAY(PL_regex_padav);
2084 }
2085#endif
2086 }
aaa362c4 2087 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2088 PL_reg_oldcurpm = PL_curpm;
2089 PL_curpm = PL_reg_curpm;
2090 if (RX_MATCH_COPIED(prog)) {
2091 /* Here is a serious problem: we cannot rewrite subbeg,
2092 since it may be needed if this match fails. Thus
2093 $` inside (?{}) could fail... */
2094 PL_reg_oldsaved = prog->subbeg;
2095 PL_reg_oldsavedlen = prog->sublen;
2096 RX_MATCH_COPIED_off(prog);
2097 }
2098 else
2099 PL_reg_oldsaved = Nullch;
2100 prog->subbeg = PL_bostr;
2101 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2102 }
2103 prog->startp[0] = startpos - PL_bostr;
2104 PL_reginput = startpos;
2105 PL_regstartp = prog->startp;
2106 PL_regendp = prog->endp;
2107 PL_reglastparen = &prog->lastparen;
a01268b5 2108 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714
JH
2109 prog->lastparen = 0;
2110 PL_regsize = 0;
2111 DEBUG_r(PL_reg_starttry = startpos);
2112 if (PL_reg_start_tmpl <= prog->nparens) {
2113 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2114 if(PL_reg_start_tmp)
2115 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2116 else
2117 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2118 }
2119
2120 /* XXXX What this code is doing here?!!! There should be no need
2121 to do this again and again, PL_reglastparen should take care of
3dd2943c 2122 this! --ilya*/
dafc8851
JH
2123
2124 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2125 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2126 * PL_reglastparen), is not needed at all by the test suite
2127 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2128 * enough, for building DynaLoader, or otherwise this
2129 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2130 * will happen. Meanwhile, this code *is* needed for the
2131 * above-mentioned test suite tests to succeed. The common theme
2132 * on those tests seems to be returning null fields from matches.
2133 * --jhi */
dafc8851 2134#if 1
d6a28714
JH
2135 sp = prog->startp;
2136 ep = prog->endp;
2137 if (prog->nparens) {
eb160463 2138 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2139 *++sp = -1;
2140 *++ep = -1;
2141 }
2142 }
dafc8851 2143#endif
02db2b7b 2144 REGCP_SET(lastcp);
d6a28714
JH
2145 if (regmatch(prog->program + 1)) {
2146 prog->endp[0] = PL_reginput - PL_bostr;
2147 return 1;
2148 }
02db2b7b 2149 REGCP_UNWIND(lastcp);
d6a28714
JH
2150 return 0;
2151}
2152
02db2b7b
IZ
2153#define RE_UNWIND_BRANCH 1
2154#define RE_UNWIND_BRANCHJ 2
2155
2156union re_unwind_t;
2157
2158typedef struct { /* XX: makes sense to enlarge it... */
2159 I32 type;
2160 I32 prev;
2161 CHECKPOINT lastcp;
2162} re_unwind_generic_t;
2163
2164typedef struct {
2165 I32 type;
2166 I32 prev;
2167 CHECKPOINT lastcp;
2168 I32 lastparen;
2169 regnode *next;
2170 char *locinput;
2171 I32 nextchr;
2172#ifdef DEBUGGING
2173 int regindent;
2174#endif
2175} re_unwind_branch_t;
2176
2177typedef union re_unwind_t {
2178 I32 type;
2179 re_unwind_generic_t generic;
2180 re_unwind_branch_t branch;
2181} re_unwind_t;
2182
8ba1375e
MJD
2183#define sayYES goto yes
2184#define sayNO goto no
e0f9d4a8 2185#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2186#define sayYES_FINAL goto yes_final
2187#define sayYES_LOUD goto yes_loud
2188#define sayNO_FINAL goto no_final
2189#define sayNO_SILENT goto do_no
2190#define saySAME(x) if (x) goto yes; else goto no
2191
2192#define REPORT_CODE_OFF 24
2193
d6a28714
JH
2194/*
2195 - regmatch - main matching routine
2196 *
2197 * Conceptually the strategy is simple: check to see whether the current
2198 * node matches, call self recursively to see whether the rest matches,
2199 * and then act accordingly. In practice we make some effort to avoid
2200 * recursion, in particular by going through "ordinary" nodes (that don't
2201 * need to know whether the rest of the match failed) by a loop instead of
2202 * by recursion.
2203 */
2204/* [lwall] I've hoisted the register declarations to the outer block in order to
2205 * maybe save a little bit of pushing and popping on the stack. It also takes
2206 * advantage of machines that use a register save mask on subroutine entry.
2207 */
2208STATIC I32 /* 0 failure, 1 success */
2209S_regmatch(pTHX_ regnode *prog)
2210{
d6a28714
JH
2211 register regnode *scan; /* Current node. */
2212 regnode *next; /* Next node. */
2213 regnode *inner; /* Next node in internal branch. */
2214 register I32 nextchr; /* renamed nextchr - nextchar colides with
2215 function of same name */
2216 register I32 n; /* no or next */
b7953727
JH
2217 register I32 ln = 0; /* len or last */
2218 register char *s = Nullch; /* operand or save */
d6a28714 2219 register char *locinput = PL_reginput;
b7953727 2220 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2221 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2222 I32 unwind = 0;
b7953727 2223#if 0
02db2b7b 2224 I32 firstcp = PL_savestack_ix;
b7953727 2225#endif
53c4c00c 2226 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2227#ifdef DEBUGGING
ce333219
JH
2228 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2229 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2230 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2231#endif
02db2b7b 2232
d6a28714
JH
2233#ifdef DEBUGGING
2234 PL_regindent++;
2235#endif
2236
2237 /* Note that nextchr is a byte even in UTF */
2238 nextchr = UCHARAT(locinput);
2239 scan = prog;
2240 while (scan != NULL) {
8ba1375e 2241
2a782b5b 2242 DEBUG_r( {
d6a28714
JH
2243 SV *prop = sv_newmortal();
2244 int docolor = *PL_colors[0];
2245 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2246 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2247 /* The part of the string before starttry has one color
2248 (pref0_len chars), between starttry and current
2249 position another one (pref_len - pref0_len chars),
2250 after the current position the third one.
2251 We assume that pref0_len <= pref_len, otherwise we
2252 decrease pref0_len. */
9041c2e3 2253 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2254 ? (5 + taill) - l : locinput - PL_bostr;
2255 int pref0_len;
d6a28714 2256
df1ffd02 2257 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2258 pref_len++;
2259 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2260 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2261 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2262 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2263 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2264 l--;
d6a28714
JH
2265 if (pref0_len < 0)
2266 pref0_len = 0;
2267 if (pref0_len > pref_len)
2268 pref0_len = pref_len;
2269 regprop(prop, scan);
2a782b5b
JH
2270 {
2271 char *s0 =
df1ffd02 2272 do_utf8 ?
2a782b5b 2273 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2274 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2275 locinput - pref_len;
df1ffd02
JH
2276 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2277 char *s1 = do_utf8 ?
2a782b5b 2278 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2279 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2280 locinput - pref_len + pref0_len;
df1ffd02
JH
2281 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2282 char *s2 = do_utf8 ?
2a782b5b 2283 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2284 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2285 locinput;
df1ffd02 2286 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2287 PerlIO_printf(Perl_debug_log,
2288 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2289 (IV)(locinput - PL_bostr),
2290 PL_colors[4],
2291 len0, s0,
2292 PL_colors[5],
2293 PL_colors[2],
2294 len1, s1,
2295 PL_colors[3],
2296 (docolor ? "" : "> <"),
2297 PL_colors[0],
2298 len2, s2,
2299 PL_colors[1],
2300 15 - l - pref_len + 1,
2301 "",
2302 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2303 SvPVX(prop));
2304 }
2305 });
d6a28714
JH
2306
2307 next = scan + NEXT_OFF(scan);
2308 if (next == scan)
2309 next = NULL;
2310
2311 switch (OP(scan)) {
2312 case BOL:
12d33761
HS
2313 if (locinput == PL_bostr || (PL_multiline &&
2314 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2315 {
2316 /* regtill = regbol; */
b8c5462f
JH
2317 break;
2318 }
d6a28714
JH
2319 sayNO;
2320 case MBOL:
12d33761
HS
2321 if (locinput == PL_bostr ||
2322 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2323 {
b8c5462f
JH
2324 break;
2325 }
d6a28714
JH
2326 sayNO;
2327 case SBOL:
c2a73568 2328 if (locinput == PL_bostr)
b8c5462f 2329 break;
d6a28714
JH
2330 sayNO;
2331 case GPOS:
2332 if (locinput == PL_reg_ganch)
2333 break;
2334 sayNO;
2335 case EOL:
2336 if (PL_multiline)
2337 goto meol;
2338 else
2339 goto seol;
2340 case MEOL:
2341 meol:
2342 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2343 sayNO;
b8c5462f 2344 break;
d6a28714
JH
2345 case SEOL:
2346 seol:
2347 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2348 sayNO;
d6a28714 2349 if (PL_regeol - locinput > 1)
b8c5462f 2350 sayNO;
b8c5462f 2351 break;
d6a28714
JH
2352 case EOS:
2353 if (PL_regeol != locinput)
b8c5462f 2354 sayNO;
d6a28714 2355 break;
ffc61ed2 2356 case SANY:
d6a28714 2357 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2358 sayNO;
f33976b4
DB
2359 if (do_utf8) {
2360 locinput += PL_utf8skip[nextchr];
2361 if (locinput > PL_regeol)
2362 sayNO;
2363 nextchr = UCHARAT(locinput);
2364 }
2365 else
2366 nextchr = UCHARAT(++locinput);
2367 break;
2368 case CANY:
2369 if (!nextchr && locinput >= PL_regeol)
2370 sayNO;
b8c5462f 2371 nextchr = UCHARAT(++locinput);
a0d0e21e 2372 break;
ffc61ed2 2373 case REG_ANY:
1aa99e6b
IH
2374 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2375 sayNO;
2376 if (do_utf8) {
b8c5462f 2377 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2378 if (locinput > PL_regeol)
2379 sayNO;
a0ed51b3 2380 nextchr = UCHARAT(locinput);
a0ed51b3 2381 }
1aa99e6b
IH
2382 else
2383 nextchr = UCHARAT(++locinput);
a0ed51b3 2384 break;
d6a28714 2385 case EXACT:
cd439c50
IZ
2386 s = STRING(scan);
2387 ln = STR_LEN(scan);
eb160463 2388 if (do_utf8 != UTF) {
bc517b45 2389 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2390 char *l = locinput;
2391 char *e = s + ln;
bc517b45 2392 STRLEN ulen;
a72c7584 2393
5ff6fc6d
JH
2394 if (do_utf8) {
2395 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2396 while (s < e) {
2397 if (l >= PL_regeol)
5ff6fc6d
JH
2398 sayNO;
2399 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2400 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2401 ckWARN(WARN_UTF8) ?
2402 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2403 sayNO;
bc517b45 2404 l += ulen;
5ff6fc6d 2405 s ++;
1aa99e6b 2406 }
5ff6fc6d
JH
2407 }
2408 else {
2409 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2410 while (s < e) {
2411 if (l >= PL_regeol)
2412 sayNO;
5ff6fc6d 2413 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2414 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2415 ckWARN(WARN_UTF8) ?
2416 0 : UTF8_ALLOW_ANY))
1aa99e6b 2417 sayNO;
bc517b45 2418 s += ulen;
a72c7584 2419 l ++;
1aa99e6b 2420 }
5ff6fc6d 2421 }
1aa99e6b
IH
2422 locinput = l;
2423 nextchr = UCHARAT(locinput);
2424 break;
2425 }
bc517b45 2426 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2427 /* Inline the first character, for speed. */
2428 if (UCHARAT(s) != nextchr)
2429 sayNO;
2430 if (PL_regeol - locinput < ln)
2431 sayNO;
2432 if (ln > 1 && memNE(s, locinput, ln))
2433 sayNO;
2434 locinput += ln;
2435 nextchr = UCHARAT(locinput);
2436 break;
2437 case EXACTFL:
b8c5462f
JH
2438 PL_reg_flags |= RF_tainted;
2439 /* FALL THROUGH */
d6a28714 2440 case EXACTF:
cd439c50
IZ
2441 s = STRING(scan);
2442 ln = STR_LEN(scan);
d6a28714 2443
d07ddd77
JH
2444 if (do_utf8 || UTF) {
2445 /* Either target or the pattern are utf8. */
d6a28714 2446 char *l = locinput;
d07ddd77 2447 char *e = PL_regeol;
bc517b45 2448
eb160463 2449 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2450 l, &e, 0, do_utf8)) {
5486206c
JH
2451 /* One more case for the sharp s:
2452 * pack("U0U*", 0xDF) =~ /ss/i,
2453 * the 0xC3 0x9F are the UTF-8
2454 * byte sequence for the U+00DF. */
2455 if (!(do_utf8 &&
2456 toLOWER(s[0]) == 's' &&
2457 ln >= 2 &&
2458 toLOWER(s[1]) == 's' &&
2459 (U8)l[0] == 0xC3 &&
2460 e - l >= 2 &&
2461 (U8)l[1] == 0x9F))
2462 sayNO;
2463 }
d07ddd77
JH
2464 locinput = e;
2465 nextchr = UCHARAT(locinput);
2466 break;
a0ed51b3 2467 }
d6a28714 2468
bc517b45
JH
2469 /* Neither the target and the pattern are utf8. */
2470
d6a28714
JH
2471 /* Inline the first character, for speed. */
2472 if (UCHARAT(s) != nextchr &&
2473 UCHARAT(s) != ((OP(scan) == EXACTF)
2474 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2475 sayNO;
d6a28714 2476 if (PL_regeol - locinput < ln)
b8c5462f 2477 sayNO;
d6a28714
JH
2478 if (ln > 1 && (OP(scan) == EXACTF
2479 ? ibcmp(s, locinput, ln)
2480 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2481 sayNO;
d6a28714
JH
2482 locinput += ln;
2483 nextchr = UCHARAT(locinput);
a0d0e21e 2484 break;
d6a28714 2485 case ANYOF:
ffc61ed2 2486 if (do_utf8) {
9e55ce06
JH
2487 STRLEN inclasslen = PL_regeol - locinput;
2488
ba7b4546 2489 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2490 sayNO_ANYOF;
ffc61ed2
JH
2491 if (locinput >= PL_regeol)
2492 sayNO;
0f0076b4 2493 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2494 nextchr = UCHARAT(locinput);
e0f9d4a8 2495 break;
ffc61ed2
JH
2496 }
2497 else {
2498 if (nextchr < 0)
2499 nextchr = UCHARAT(locinput);
7d3e948e 2500 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2501 sayNO_ANYOF;
ffc61ed2
JH
2502 if (!nextchr && locinput >= PL_regeol)
2503 sayNO;
2504 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2505 break;
2506 }
2507 no_anyof:
2508 /* If we might have the case of the German sharp s
2509 * in a casefolding Unicode character class. */
2510
ebc501f0
JH
2511 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2512 locinput += SHARP_S_SKIP;
e0f9d4a8 2513 nextchr = UCHARAT(locinput);
ffc61ed2 2514 }
e0f9d4a8
JH
2515 else
2516 sayNO;
b8c5462f 2517 break;
d6a28714 2518 case ALNUML:
b8c5462f
JH
2519 PL_reg_flags |= RF_tainted;
2520 /* FALL THROUGH */
d6a28714 2521 case ALNUM:
b8c5462f 2522 if (!nextchr)
4633a7c4 2523 sayNO;
ffc61ed2 2524 if (do_utf8) {
ad24be35 2525 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2526 if (!(OP(scan) == ALNUM
3568d838 2527 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2528 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2529 {
2530 sayNO;
a0ed51b3 2531 }
b8c5462f 2532 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2533 nextchr = UCHARAT(locinput);
2534 break;
2535 }
ffc61ed2 2536 if (!(OP(scan) == ALNUM
d6a28714 2537 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2538 sayNO;
b8c5462f 2539 nextchr = UCHARAT(++locinput);
a0d0e21e 2540 break;
d6a28714 2541 case NALNUML:
b8c5462f
JH
2542 PL_reg_flags |= RF_tainted;
2543 /* FALL THROUGH */
d6a28714
JH
2544 case NALNUM:
2545 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2546 sayNO;
ffc61ed2 2547 if (do_utf8) {
8269fa76 2548 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2549 if (OP(scan) == NALNUM
3568d838 2550 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2551 : isALNUM_LC_utf8((U8*)locinput))
2552 {
b8c5462f 2553 sayNO;
d6a28714 2554 }
b8c5462f
JH
2555 locinput += PL_utf8skip[nextchr];
2556 nextchr = UCHARAT(locinput);
2557 break;
2558 }
ffc61ed2 2559 if (OP(scan) == NALNUM
d6a28714 2560 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2561 sayNO;
76e3520e 2562 nextchr = UCHARAT(++locinput);
a0d0e21e 2563 break;
d6a28714
JH
2564 case BOUNDL:
2565 case NBOUNDL:
3280af22 2566 PL_reg_flags |= RF_tainted;
bbce6d69 2567 /* FALL THROUGH */
d6a28714
JH
2568 case BOUND:
2569 case NBOUND:
2570 /* was last char in word? */
ffc61ed2 2571 if (do_utf8) {
12d33761
HS
2572 if (locinput == PL_bostr)
2573 ln = '\n';
ffc61ed2 2574 else {
b4f7163a 2575 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2576
b4f7163a 2577 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2578 }
2579 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2580 ln = isALNUM_uni(ln);
8269fa76 2581 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2582 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2583 }
2584 else {
9041c2e3 2585 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2586 n = isALNUM_LC_utf8((U8*)locinput);
2587 }
a0ed51b3 2588 }
d6a28714 2589 else {
12d33761
HS
2590 ln = (locinput != PL_bostr) ?
2591 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2592 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2593 ln = isALNUM(ln);
2594 n = isALNUM(nextchr);
2595 }
2596 else {
2597 ln = isALNUM_LC(ln);
2598 n = isALNUM_LC(nextchr);
2599 }
d6a28714 2600 }
ffc61ed2
JH
2601 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2602 OP(scan) == BOUNDL))
2603 sayNO;
a0ed51b3 2604 break;
d6a28714 2605 case SPACEL:
3280af22 2606 PL_reg_flags |= RF_tainted;
bbce6d69 2607 /* FALL THROUGH */
d6a28714 2608 case SPACE:
9442cb0e 2609 if (!nextchr)
4633a7c4 2610 sayNO;
1aa99e6b 2611 if (do_utf8) {
fd400ab9 2612 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2613 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2614 if (!(OP(scan) == SPACE
3568d838 2615 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2616 : isSPACE_LC_utf8((U8*)locinput)))
2617 {
2618 sayNO;
2619 }
2620 locinput += PL_utf8skip[nextchr];
2621 nextchr = UCHARAT(locinput);
2622 break;
d6a28714 2623 }
ffc61ed2
JH
2624 if (!(OP(scan) == SPACE
2625 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2626 sayNO;
2627 nextchr = UCHARAT(++locinput);
2628 }
2629 else {
2630 if (!(OP(scan) == SPACE
2631 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2632 sayNO;
2633 nextchr = UCHARAT(++locinput);
a0ed51b3 2634 }
a0ed51b3 2635 break;
d6a28714 2636 case NSPACEL:
3280af22 2637 PL_reg_flags |= RF_tainted;
bbce6d69 2638 /* FALL THROUGH */
d6a28714 2639 case NSPACE:
9442cb0e 2640 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2641 sayNO;
1aa99e6b 2642 if (do_utf8) {
8269fa76 2643 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2644 if (OP(scan) == NSPACE
3568d838 2645 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2646 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2647 {
2648 sayNO;
2649 }
2650 locinput += PL_utf8skip[nextchr];
2651 nextchr = UCHARAT(locinput);
2652 break;
a0ed51b3 2653 }
ffc61ed2 2654 if (OP(scan) == NSPACE
d6a28714 2655 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2656 sayNO;
76e3520e 2657 nextchr = UCHARAT(++locinput);
a0d0e21e 2658 break;
d6a28714 2659 case DIGITL:
a0ed51b3
LW
2660 PL_reg_flags |= RF_tainted;
2661 /* FALL THROUGH */
d6a28714 2662 case DIGIT:
9442cb0e 2663 if (!nextchr)
a0ed51b3 2664 sayNO;
1aa99e6b 2665 if (do_utf8) {
8269fa76 2666 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2667 if (!(OP(scan) == DIGIT
3568d838 2668 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2669 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2670 {
a0ed51b3 2671 sayNO;
dfe13c55 2672 }
6f06b55f 2673 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2674 nextchr = UCHARAT(locinput);
2675 break;
2676 }
ffc61ed2 2677 if (!(OP(scan) == DIGIT
9442cb0e 2678 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2679 sayNO;
2680 nextchr = UCHARAT(++locinput);
2681 break;
d6a28714 2682 case NDIGITL:
b8c5462f
JH
2683 PL_reg_flags |= RF_tainted;
2684 /* FALL THROUGH */
d6a28714 2685 case NDIGIT:
9442cb0e 2686 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2687 sayNO;
1aa99e6b 2688 if (do_utf8) {
8269fa76 2689 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2690 if (OP(scan) == NDIGIT
3568d838 2691 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2692 : isDIGIT_LC_utf8((U8*)locinput))
2693 {
a0ed51b3 2694 sayNO;
9442cb0e 2695 }
6f06b55f 2696 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2697 nextchr = UCHARAT(locinput);
2698 break;
2699 }
ffc61ed2 2700 if (OP(scan) == NDIGIT
9442cb0e 2701 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2702 sayNO;
2703 nextchr = UCHARAT(++locinput);
2704 break;
2705 case CLUMP:
b7c83a7e 2706 if (locinput >= PL_regeol)
a0ed51b3 2707 sayNO;
b7c83a7e
JH
2708 if (do_utf8) {
2709 LOAD_UTF8_CHARCLASS(mark,"~");
2710 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2711 sayNO;
2712 locinput += PL_utf8skip[nextchr];
2713 while (locinput < PL_regeol &&
2714 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2715 locinput += UTF8SKIP(locinput);
2716 if (locinput > PL_regeol)
2717 sayNO;
eb08e2da
JH
2718 }
2719 else
2720 locinput++;
a0ed51b3
LW
2721 nextchr = UCHARAT(locinput);
2722 break;
c8756f30 2723 case REFFL:
3280af22 2724 PL_reg_flags |= RF_tainted;
c8756f30 2725 /* FALL THROUGH */
c277df42 2726 case REF:
c8756f30 2727 case REFF:
c277df42 2728 n = ARG(scan); /* which paren pair */
cf93c79d 2729 ln = PL_regstartp[n];
2c2d71f5 2730 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2731 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2732 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2733 if (ln == PL_regendp[n])
a0d0e21e 2734 break;
a0ed51b3 2735
cf93c79d 2736 s = PL_bostr + ln;
1aa99e6b 2737 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2738 char *l = locinput;
cf93c79d 2739 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2740 /*
2741 * Note that we can't do the "other character" lookup trick as
2742 * in the 8-bit case (no pun intended) because in Unicode we
2743 * have to map both upper and title case to lower case.
2744 */
2745 if (OP(scan) == REFF) {
a2a2844f 2746 STRLEN ulen1, ulen2;
e7ae6809
JH
2747 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2748 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2749 while (s < e) {
2750 if (l >= PL_regeol)
2751 sayNO;
a2a2844f
JH
2752 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2753 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2754 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2755 sayNO;
a2a2844f
JH
2756 s += ulen1;
2757 l += ulen2;
a0ed51b3
LW
2758 }
2759 }
2760 locinput = l;
2761 nextchr = UCHARAT(locinput);
2762 break;
2763 }
2764
a0d0e21e 2765 /* Inline the first character, for speed. */
76e3520e 2766 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2767 (OP(scan) == REF ||
2768 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2769 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2770 sayNO;
cf93c79d 2771 ln = PL_regendp[n] - ln;
3280af22 2772 if (locinput + ln > PL_regeol)
4633a7c4 2773 sayNO;
c8756f30
AK
2774 if (ln > 1 && (OP(scan) == REF
2775 ? memNE(s, locinput, ln)
2776 : (OP(scan) == REFF
2777 ? ibcmp(s, locinput, ln)
2778 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2779 sayNO;
a0d0e21e 2780 locinput += ln;
76e3520e 2781 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2782 break;
2783
2784 case NOTHING:
c277df42 2785 case TAIL:
a0d0e21e
LW
2786 break;
2787 case BACK:
2788 break;
c277df42
IZ
2789 case EVAL:
2790 {
2791 dSP;
533c011a 2792 OP_4tree *oop = PL_op;
3280af22 2793 COP *ocurcop = PL_curcop;
f3548bdc 2794 PAD *old_comppad;
c277df42 2795 SV *ret;
9041c2e3 2796
c277df42 2797 n = ARG(scan);
533c011a 2798 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2799 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2800 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2801 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2802
8e5e9ebe
RGS
2803 {
2804 SV **before = SP;
2805 CALLRUNOPS(aTHX); /* Scalar context. */
2806 SPAGAIN;
2807 if (SP == before)
075aa684 2808 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2809 else {
2810 ret = POPs;
2811 PUTBACK;
2812 }
2813 }
2814
0f5d15d6 2815 PL_op = oop;
f3548bdc 2816 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2817 PL_curcop = ocurcop;
c277df42 2818 if (logical) {
0f5d15d6
IZ
2819 if (logical == 2) { /* Postponed subexpression. */
2820 regexp *re;
22c35a8c 2821 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2822 re_cc_state state;
0f5d15d6 2823 CHECKPOINT cp, lastcp;
cb50f42d 2824 int toggleutf;
0f5d15d6
IZ
2825
2826 if(SvROK(ret) || SvRMAGICAL(ret)) {
2827 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2828
2829 if(SvMAGICAL(sv))
14befaf4 2830 mg = mg_find(sv, PERL_MAGIC_qr);
0f5d15d6
IZ
2831 }
2832 if (mg) {
2833 re = (regexp *)mg->mg_obj;
df0003d4 2834 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2835 }
2836 else {
2837 STRLEN len;
2838 char *t = SvPV(ret, len);
2839 PMOP pm;
2840 char *oprecomp = PL_regprecomp;
2841 I32 osize = PL_regsize;
2842 I32 onpar = PL_regnpar;
2843
5fcd1c1b 2844 Zero(&pm, 1, PMOP);
cb50f42d 2845 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2846 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2847 if (!(SvFLAGS(ret)
0f5d15d6 2848 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
14befaf4
DM
2849 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2850 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2851 PL_regprecomp = oprecomp;
2852 PL_regsize = osize;
2853 PL_regnpar = onpar;
2854 }
2855 DEBUG_r(
9041c2e3 2856 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2857 "Entering embedded `%s%.60s%s%s'\n",
2858 PL_colors[0],
2859 re->precomp,
2860 PL_colors[1],
2861 (strlen(re->precomp) > 60 ? "..." : ""))
2862 );
2863 state.node = next;
2864 state.prev = PL_reg_call_cc;
2865 state.cc = PL_regcc;
2866 state.re = PL_reg_re;
2867
2ab05381 2868 PL_regcc = 0;
9041c2e3 2869
0f5d15d6 2870 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2871 REGCP_SET(lastcp);
0f5d15d6
IZ
2872 cache_re(re);
2873 state.ss = PL_savestack_ix;
2874 *PL_reglastparen = 0;
a01268b5 2875 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2876 PL_reg_call_cc = &state;
2877 PL_reginput = locinput;
cb50f42d
YST
2878 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2879 ((re->reganch & ROPT_UTF8) != 0);
2880 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2881
2882 /* XXXX This is too dramatic a measure... */
2883 PL_reg_maxiter = 0;
2884
0f5d15d6 2885 if (regmatch(re->program + 1)) {
2c914db6
IZ
2886 /* Even though we succeeded, we need to restore
2887 global variables, since we may be wrapped inside
2888 SUSPEND, thus the match may be not finished yet. */
2889
2890 /* XXXX Do this only if SUSPENDed? */
2891 PL_reg_call_cc = state.prev;
2892 PL_regcc = state.cc;
2893 PL_reg_re = state.re;
2894 cache_re(PL_reg_re);
cb50f42d 2895 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2896
2897 /* XXXX This is too dramatic a measure... */
2898 PL_reg_maxiter = 0;
2899
2900 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2901 ReREFCNT_dec(re);
2902 regcpblow(cp);
2903 sayYES;
2904 }
0f5d15d6 2905 ReREFCNT_dec(re);
02db2b7b 2906 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2907 regcppop();
2908 PL_reg_call_cc = state.prev;
2909 PL_regcc = state.cc;
2910 PL_reg_re = state.re;
d3790889 2911 cache_re(PL_reg_re);
cb50f42d 2912 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2913
2914 /* XXXX This is too dramatic a measure... */
2915 PL_reg_maxiter = 0;
2916
8e514ae6 2917 logical = 0;
0f5d15d6
IZ
2918 sayNO;
2919 }
c277df42 2920 sw = SvTRUE(ret);
0f5d15d6 2921 logical = 0;
a0ed51b3
LW
2922 }
2923 else
3280af22 2924 sv_setsv(save_scalar(PL_replgv), ret);
c277df42
IZ
2925 break;
2926 }
a0d0e21e 2927 case OPEN:
c277df42 2928 n = ARG(scan); /* which paren pair */
3280af22
NIS
2929 PL_reg_start_tmp[n] = locinput;
2930 if (n > PL_regsize)
2931 PL_regsize = n;
a0d0e21e
LW
2932 break;
2933 case CLOSE:
c277df42 2934 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2935 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2936 PL_regendp[n] = locinput - PL_bostr;
eb160463 2937 if (n > (I32)*PL_reglastparen)
3280af22 2938 *PL_reglastparen = n;
a01268b5 2939 *PL_reglastcloseparen = n;
a0d0e21e 2940 break;
c277df42
IZ
2941 case GROUPP:
2942 n = ARG(scan); /* which paren pair */
eb160463 2943 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2944 break;
2945 case IFTHEN:
2c2d71f5 2946 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2947 if (sw)
2948 next = NEXTOPER(NEXTOPER(scan));
2949 else {
2950 next = scan + ARG(scan);
2951 if (OP(next) == IFTHEN) /* Fake one. */
2952 next = NEXTOPER(NEXTOPER(next));
2953 }
2954 break;
2955 case LOGICAL:
0f5d15d6 2956 logical = scan->flags;
c277df42 2957 break;
2ab05381
IZ
2958/*******************************************************************
2959 PL_regcc contains infoblock about the innermost (...)* loop, and
2960 a pointer to the next outer infoblock.
2961
2962 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2963
2964 1) After matching X, regnode for CURLYX is processed;
2965
9041c2e3 2966 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
2967 regmatch() recursively with the starting point at WHILEM node;
2968
2969 3) Each hit of WHILEM node tries to match A and Z (in the order
2970 depending on the current iteration, min/max of {min,max} and
2971 greediness). The information about where are nodes for "A"
2972 and "Z" is read from the infoblock, as is info on how many times "A"
2973 was already matched, and greediness.
2974
2975 4) After A matches, the same WHILEM node is hit again.
2976
2977 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2978 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2979 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2980 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2981 of the external loop.
2982
2983 Currently present infoblocks form a tree with a stem formed by PL_curcc
2984 and whatever it mentions via ->next, and additional attached trees
2985 corresponding to temporarily unset infoblocks as in "5" above.
2986
9041c2e3 2987 In the following picture infoblocks for outer loop of
2ab05381
IZ
2988 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2989 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2990 infoblocks are drawn below the "reset" infoblock.
2991
2992 In fact in the picture below we do not show failed matches for Z and T
2993 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2994 more obvious *why* one needs to *temporary* unset infoblocks.]
2995
2996 Matched REx position InfoBlocks Comment
2997 (Y(A)*?Z)*?T x
2998 Y(A)*?Z)*?T x <- O
2999 Y (A)*?Z)*?T x <- O
3000 Y A)*?Z)*?T x <- O <- I
3001 YA )*?Z)*?T x <- O <- I
3002 YA A)*?Z)*?T x <- O <- I
3003 YAA )*?Z)*?T x <- O <- I
3004 YAA Z)*?T x <- O # Temporary unset I
3005 I
3006
3007 YAAZ Y(A)*?Z)*?T x <- O
3008 I
3009
3010 YAAZY (A)*?Z)*?T x <- O
3011 I
3012
3013 YAAZY A)*?Z)*?T x <- O <- I
3014 I
3015
3016 YAAZYA )*?Z)*?T x <- O <- I
3017 I
3018
3019 YAAZYA Z)*?T x <- O # Temporary unset I
3020 I,I
3021
3022 YAAZYAZ )*?T x <- O
3023 I,I
3024
3025 YAAZYAZ T x # Temporary unset O
3026 O
3027 I,I
3028
3029 YAAZYAZT x
3030 O
3031 I,I
3032 *******************************************************************/
a0d0e21e
LW
3033 case CURLYX: {
3034 CURCUR cc;
3280af22 3035 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3036 /* No need to save/restore up to this paren */
3037 I32 parenfloor = scan->flags;
c277df42
IZ
3038
3039 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3040 next += ARG(next);
3280af22
NIS
3041 cc.oldcc = PL_regcc;
3042 PL_regcc = &cc;
cb434fcc
IZ
3043 /* XXXX Probably it is better to teach regpush to support
3044 parenfloor > PL_regsize... */
eb160463 3045 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3046 parenfloor = *PL_reglastparen; /* Pessimization... */
3047 cc.parenfloor = parenfloor;
a0d0e21e
LW
3048 cc.cur = -1;
3049 cc.min = ARG1(scan);
3050 cc.max = ARG2(scan);
c277df42 3051 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3052 cc.next = next;
3053 cc.minmod = minmod;
3054 cc.lastloc = 0;
3280af22 3055 PL_reginput = locinput;
a0d0e21e
LW
3056 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3057 regcpblow(cp);
3280af22 3058 PL_regcc = cc.oldcc;
4633a7c4 3059 saySAME(n);
a0d0e21e
LW
3060 }
3061 /* NOT REACHED */
3062 case WHILEM: {
3063 /*
3064 * This is really hard to understand, because after we match
3065 * what we're trying to match, we must make sure the rest of
2c2d71f5 3066 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3067 * to go back UP the parse tree by recursing ever deeper. And
3068 * if it fails, we have to reset our parent's current state
3069 * that we can try again after backing off.
3070 */
3071
c277df42 3072 CHECKPOINT cp, lastcp;
3280af22 3073 CURCUR* cc = PL_regcc;
c277df42
IZ
3074 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3075
4633a7c4 3076 n = cc->cur + 1; /* how many we know we matched */
3280af22 3077 PL_reginput = locinput;
a0d0e21e 3078
c277df42 3079 DEBUG_r(
9041c2e3 3080 PerlIO_printf(Perl_debug_log,
91f3b821 3081 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3082 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3083 (long)n, (long)cc->min,
2797576d 3084 (long)cc->max, PTR2UV(cc))
c277df42 3085 );
4633a7c4 3086
a0d0e21e
LW
3087 /* If degenerate scan matches "", assume scan done. */
3088
579cf2c3 3089 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3090 PL_regcc = cc->oldcc;
2ab05381
IZ
3091 if (PL_regcc)
3092 ln = PL_regcc->cur;
c277df42 3093 DEBUG_r(
c3464db5
DD
3094 PerlIO_printf(Perl_debug_log,
3095 "%*s empty match detected, try continuation...\n",
3280af22 3096 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3097 );
a0d0e21e 3098 if (regmatch(cc->next))
4633a7c4 3099 sayYES;
2ab05381
IZ
3100 if (PL_regcc)
3101 PL_regcc->cur = ln;
3280af22 3102 PL_regcc = cc;
4633a7c4 3103 sayNO;
a0d0e21e
LW
3104 }
3105
3106 /* First just match a string of min scans. */
3107
3108 if (n < cc->min) {
3109 cc->cur = n;
3110 cc->lastloc = locinput;
4633a7c4
LW
3111 if (regmatch(cc->scan))
3112 sayYES;
3113 cc->cur = n - 1;
c277df42 3114 cc->lastloc = lastloc;
4633a7c4 3115 sayNO;
a0d0e21e
LW
3116 }
3117
2c2d71f5
JH
3118 if (scan->flags) {
3119 /* Check whether we already were at this position.
3120 Postpone detection until we know the match is not
3121 *that* much linear. */
3122 if (!PL_reg_maxiter) {
3123 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3124 PL_reg_leftiter = PL_reg_maxiter;
3125 }
3126 if (PL_reg_leftiter-- == 0) {
3127 I32 size = (PL_reg_maxiter + 7)/8;
3128 if (PL_reg_poscache) {
eb160463 3129 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3130 Renew(PL_reg_poscache, size, char);
3131 PL_reg_poscache_size = size;
3132 }
3133 Zero(PL_reg_poscache, size, char);
3134 }
3135 else {
3136 PL_reg_poscache_size = size;
3137 Newz(29, PL_reg_poscache, size, char);
3138 }
3139 DEBUG_r(
3140 PerlIO_printf(Perl_debug_log,
3141 "%sDetected a super-linear match, switching on caching%s...\n",
3142 PL_colors[4], PL_colors[5])
3143 );
3144 }
3145 if (PL_reg_leftiter < 0) {
3146 I32 o = locinput - PL_bostr, b;
3147
3148 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3149 b = o % 8;
3150 o /= 8;
3151 if (PL_reg_poscache[o] & (1<<b)) {
3152 DEBUG_r(
3153 PerlIO_printf(Perl_debug_log,
3154 "%*s already tried at this position...\n",
3155 REPORT_CODE_OFF+PL_regindent*2, "")
3156 );
7821416a 3157 sayNO_SILENT;
2c2d71f5
JH
3158 }
3159 PL_reg_poscache[o] |= (1<<b);
3160 }
3161 }
3162
a0d0e21e
LW
3163 /* Prefer next over scan for minimal matching. */
3164
3165 if (cc->minmod) {
3280af22 3166 PL_regcc = cc->oldcc;
2ab05381
IZ
3167 if (PL_regcc)
3168 ln = PL_regcc->cur;
5f05dabc 3169 cp = regcppush(cc->parenfloor);
02db2b7b 3170 REGCP_SET(lastcp);
5f05dabc 3171 if (regmatch(cc->next)) {
c277df42 3172 regcpblow(cp);
4633a7c4 3173 sayYES; /* All done. */
5f05dabc 3174 }
02db2b7b 3175 REGCP_UNWIND(lastcp);
5f05dabc 3176 regcppop();
2ab05381
IZ
3177 if (PL_regcc)
3178 PL_regcc->cur = ln;
3280af22 3179 PL_regcc = cc;
a0d0e21e 3180
c277df42 3181 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3182 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3183 && !(PL_reg_flags & RF_warned)) {
3184 PL_reg_flags |= RF_warned;
9014280d 3185 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3186 "Complex regular subexpression recursion",
3187 REG_INFTY - 1);
c277df42 3188 }
4633a7c4 3189 sayNO;
c277df42 3190 }
a687059c 3191
c277df42 3192 DEBUG_r(
c3464db5
DD
3193 PerlIO_printf(Perl_debug_log,
3194 "%*s trying longer...\n",
3280af22 3195 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3196 );
a0d0e21e 3197 /* Try scanning more and see if it helps. */
3280af22 3198 PL_reginput = locinput;
a0d0e21e
LW
3199 cc->cur = n;
3200 cc->lastloc = locinput;
5f05dabc 3201 cp = regcppush(cc->parenfloor);
02db2b7b 3202 REGCP_SET(lastcp);
5f05dabc 3203 if (regmatch(cc->scan)) {
c277df42 3204 regcpblow(cp);
4633a7c4 3205 sayYES;
5f05dabc 3206 }
02db2b7b 3207 REGCP_UNWIND(lastcp);
5f05dabc 3208 regcppop();
4633a7c4 3209 cc->cur = n - 1;
c277df42 3210 cc->lastloc = lastloc;
4633a7c4 3211 sayNO;
a0d0e21e
LW
3212 }
3213
3214 /* Prefer scan over next for maximal matching. */
3215
3216 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3217 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3218 cc->cur = n;
3219 cc->lastloc = locinput;
02db2b7b 3220 REGCP_SET(lastcp);
5f05dabc 3221 if (regmatch(cc->scan)) {
c277df42 3222 regcpblow(cp);
4633a7c4 3223 sayYES;
5f05dabc 3224 }
02db2b7b 3225 REGCP_UNWIND(lastcp);
a0d0e21e 3226 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3227 PL_reginput = locinput;
c277df42 3228 DEBUG_r(
c3464db5
DD
3229 PerlIO_printf(Perl_debug_log,
3230 "%*s failed, try continuation...\n",
3280af22 3231 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3232 );
3233 }
9041c2e3 3234 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3235 && !(PL_reg_flags & RF_warned)) {
3280af22 3236 PL_reg_flags |= RF_warned;
9014280d 3237 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3238 "Complex regular subexpression recursion",
3239 REG_INFTY - 1);
a0d0e21e
LW
3240 }
3241
3242 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3243 PL_regcc = cc->oldcc;
2ab05381
IZ
3244 if (PL_regcc)
3245 ln = PL_regcc->cur;
a0d0e21e 3246 if (regmatch(cc->next))
4633a7c4 3247 sayYES;
2ab05381
IZ
3248 if (PL_regcc)
3249 PL_regcc->cur = ln;
3280af22 3250 PL_regcc = cc;
4633a7c4 3251 cc->cur = n - 1;
c277df42 3252 cc->lastloc = lastloc;
4633a7c4 3253 sayNO;
a0d0e21e
LW
3254 }
3255 /* NOT REACHED */
9041c2e3 3256 case BRANCHJ:
c277df42
IZ
3257 next = scan + ARG(scan);
3258 if (next == scan)
3259 next = NULL;
3260 inner = NEXTOPER(NEXTOPER(scan));
3261 goto do_branch;
9041c2e3 3262 case BRANCH:
c277df42
IZ
3263 inner = NEXTOPER(scan);
3264 do_branch:
3265 {
c277df42
IZ
3266 c1 = OP(scan);
3267 if (OP(next) != c1) /* No choice. */
3268 next = inner; /* Avoid recursion. */
a0d0e21e 3269 else {
02db2b7b
IZ
3270 I32 lastparen = *PL_reglastparen;
3271 I32 unwind1;
3272 re_unwind_branch_t *uw;
3273
3274 /* Put unwinding data on stack */
3275 unwind1 = SSNEWt(1,re_unwind_branch_t);
3276 uw = SSPTRt(unwind1,re_unwind_branch_t);
3277 uw->prev = unwind;
3278 unwind = unwind1;
3279 uw->type = ((c1 == BRANCH)
3280 ? RE_UNWIND_BRANCH
3281 : RE_UNWIND_BRANCHJ);
3282 uw->lastparen = lastparen;
3283 uw->next = next;
3284 uw->locinput = locinput;
3285 uw->nextchr = nextchr;
3286#ifdef DEBUGGING
3287 uw->regindent = ++PL_regindent;
3288#endif
c277df42 3289
02db2b7b
IZ
3290 REGCP_SET(uw->lastcp);
3291
3292 /* Now go into the first branch */
3293 next = inner;
a687059c 3294 }
a0d0e21e
LW
3295 }
3296 break;
3297 case MINMOD:
3298 minmod = 1;
3299 break;
c277df42
IZ
3300 case CURLYM:
3301 {
00db4c45 3302 I32 l = 0;
c277df42 3303 CHECKPOINT lastcp;
9041c2e3 3304
c277df42
IZ
3305 /* We suppose that the next guy does not need
3306 backtracking: in particular, it is of constant length,
3307 and has no parenths to influence future backrefs. */
3308 ln = ARG1(scan); /* min to match */
3309 n = ARG2(scan); /* max to match */
c277df42
IZ
3310 paren = scan->flags;
3311 if (paren) {
3280af22
NIS
3312 if (paren > PL_regsize)
3313 PL_regsize = paren;
eb160463 3314 if (paren > (I32)*PL_reglastparen)
3280af22 3315 *PL_reglastparen = paren;
c277df42 3316 }
dc45a647 3317 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3318 if (paren)
3319 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3320 PL_reginput = locinput;
c277df42
IZ
3321 if (minmod) {
3322 minmod = 0;
3323 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3324 sayNO;
f31a99c8
HS
3325 /* if we matched something zero-length we don't need to
3326 backtrack - capturing parens are already defined, so
3327 the caveat in the maximal case doesn't apply
3328
3329 XXXX if ln == 0, we can redo this check first time
3330 through the following loop
3331 */
3332 if (ln && l == 0)
3333 n = ln; /* don't backtrack */
3280af22 3334 locinput = PL_reginput;
cca55fe3 3335 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3336 regnode *text_node = next;
3337
cca55fe3 3338 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3339
cca55fe3 3340 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3341 else {
cca55fe3
JP
3342 if (PL_regkind[(U8)OP(text_node)] == REF) {
3343 I32 n, ln;
3344 n = ARG(text_node); /* which paren pair */
3345 ln = PL_regstartp[n];
3346 /* assume yes if we haven't seen CLOSEn */
3347 if (
eb160463 3348 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3349 ln == -1 ||
3350 ln == PL_regendp[n]
3351 ) {
3352 c1 = c2 = -1000;
3353 goto assume_ok_MM;
3354 }
3355 c1 = *(PL_bostr + ln);
3356 }
3357 else { c1 = (U8)*STRING(text_node); }
af5decee 3358 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3359 c2 = PL_fold[c1];
af5decee 3360 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3361 c2 = PL_fold_locale[c1];
3362 else
3363 c2 = c1;
3364 }
a0ed51b3
LW
3365 }
3366 else
c277df42 3367 c1 = c2 = -1000;
cca55fe3 3368 assume_ok_MM:
02db2b7b 3369 REGCP_SET(lastcp);
5f4b28b2 3370 /* This may be improved if l == 0. */
c277df42
IZ
3371 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3372 /* If it could work, try it. */
3373 if (c1 == -1000 ||
3280af22
NIS
3374 UCHARAT(PL_reginput) == c1 ||
3375 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3376 {
3377 if (paren) {
f31a99c8 3378 if (ln) {
cf93c79d
IZ
3379 PL_regstartp[paren] =
3380 HOPc(PL_reginput, -l) - PL_bostr;
3381 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3382 }
3383 else
cf93c79d 3384 PL_regendp[paren] = -1;
c277df42
IZ
3385 }
3386 if (regmatch(next))
3387 sayYES;
02db2b7b 3388 REGCP_UNWIND(lastcp);
c277df42
IZ
3389 }
3390 /* Couldn't or didn't -- move forward. */
3280af22 3391 PL_reginput = locinput;
c277df42
IZ
3392 if (regrepeat_hard(scan, 1, &l)) {
3393 ln++;
3280af22 3394 locinput = PL_reginput;
c277df42
IZ
3395 }
3396 else
3397 sayNO;
3398 }
a0ed51b3
LW
3399 }
3400 else {
c277df42 3401 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3402 /* if we matched something zero-length we don't need to
3403 backtrack, unless the minimum count is zero and we
3404 are capturing the result - in that case the capture
3405 being defined or not may affect later execution
3406 */
3407 if (n != 0 && l == 0 && !(paren && ln == 0))
3408 ln = n; /* don't backtrack */
3280af22 3409 locinput = PL_reginput;
c277df42 3410 DEBUG_r(
5c0ca799 3411 PerlIO_printf(Perl_debug_log,
faccc32b 3412 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3413 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3414 (IV) n, (IV)l)
c277df42
IZ
3415 );
3416 if (n >= ln) {
cca55fe3 3417 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3418 regnode *text_node = next;
3419
cca55fe3 3420 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3421
cca55fe3 3422 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3423 else {
cca55fe3
JP
3424 if (PL_regkind[(U8)OP(text_node)] == REF) {
3425 I32 n, ln;
3426 n = ARG(text_node); /* which paren pair */
3427 ln = PL_regstartp[n];
3428 /* assume yes if we haven't seen CLOSEn */
3429 if (
eb160463 3430 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3431 ln == -1 ||
3432 ln == PL_regendp[n]
3433 ) {
3434 c1 = c2 = -1000;
3435 goto assume_ok_REG;
3436 }
3437 c1 = *(PL_bostr + ln);
3438 }
3439 else { c1 = (U8)*STRING(text_node); }
3440
af5decee 3441 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3442 c2 = PL_fold[c1];
af5decee 3443 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3444 c2 = PL_fold_locale[c1];
3445 else
3446 c2 = c1;
3447 }
a0ed51b3
LW
3448 }
3449 else
c277df42
IZ
3450 c1 = c2 = -1000;
3451 }
cca55fe3 3452 assume_ok_REG:
02db2b7b 3453 REGCP_SET(lastcp);
c277df42
IZ
3454 while (n >= ln) {
3455 /* If it could work, try it. */
3456 if (c1 == -1000 ||
3280af22
NIS
3457 UCHARAT(PL_reginput) == c1 ||
3458 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3459 {
3460 DEBUG_r(
c3464db5 3461 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3462 "%*s trying tail with n=%"IVdf"...\n",
3463 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3464 );
3465 if (paren) {
3466 if (n) {
cf93c79d
IZ
3467 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3468 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3469 }
a0ed51b3 3470 else
cf93c79d 3471 PL_regendp[paren] = -1;
c277df42 3472 }
a0ed51b3
LW
3473 if (regmatch(next))
3474 sayYES;
02db2b7b 3475 REGCP_UNWIND(lastcp);
a0ed51b3 3476 }
c277df42
IZ
3477 /* Couldn't or didn't -- back up. */
3478 n--;
dfe13c55 3479 locinput = HOPc(locinput, -l);
3280af22 3480 PL_reginput = locinput;
c277df42
IZ
3481 }
3482 }
3483 sayNO;
3484 break;
3485 }
3486 case CURLYN:
3487 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3488 if (paren > PL_regsize)
3489 PL_regsize = paren;
eb160463 3490 if (paren > (I32)*PL_reglastparen)
3280af22 3491 *PL_reglastparen = paren;
c277df42
IZ
3492 ln = ARG1(scan); /* min to match */
3493 n = ARG2(scan); /* max to match */
dc45a647 3494 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3495 goto repeat;
a0d0e21e 3496 case CURLY:
c277df42 3497 paren = 0;
a0d0e21e
LW
3498 ln = ARG1(scan); /* min to match */
3499 n = ARG2(scan); /* max to match */
dc45a647 3500 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3501 goto repeat;
3502 case STAR:
3503 ln = 0;
c277df42 3504 n = REG_INFTY;
a0d0e21e 3505 scan = NEXTOPER(scan);
c277df42 3506 paren = 0;
a0d0e21e
LW
3507 goto repeat;
3508 case PLUS:
c277df42
IZ
3509 ln = 1;
3510 n = REG_INFTY;
3511 scan = NEXTOPER(scan);
3512 paren = 0;
3513 repeat:
a0d0e21e
LW
3514 /*
3515 * Lookahead to avoid useless match attempts
3516 * when we know what character comes next.
3517 */
5f80c4cf
JP
3518
3519 /*
3520 * Used to only do .*x and .*?x, but now it allows
3521 * for )'s, ('s and (?{ ... })'s to be in the way
3522 * of the quantifier and the EXACT-like node. -- japhy
3523 */
3524
cca55fe3 3525 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3526 U8 *s;
3527 regnode *text_node = next;
3528
cca55fe3 3529 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3530
cca55fe3 3531 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3532 else {
cca55fe3
JP
3533 if (PL_regkind[(U8)OP(text_node)] == REF) {
3534 I32 n, ln;
3535 n = ARG(text_node); /* which paren pair */
3536 ln = PL_regstartp[n];
3537 /* assume yes if we haven't seen CLOSEn */
3538 if (
eb160463 3539 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3540 ln == -1 ||
3541 ln == PL_regendp[n]
3542 ) {
3543 c1 = c2 = -1000;
3544 goto assume_ok_easy;
3545 }
9246c65e 3546 s = (U8*)PL_bostr + ln;
cca55fe3
JP
3547 }
3548 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3549
3550 if (!UTF) {
3551 c2 = c1 = *s;
f65d3ee7 3552 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3553 c2 = PL_fold[c1];
f65d3ee7 3554 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3555 c2 = PL_fold_locale[c1];
1aa99e6b 3556 }
5f80c4cf 3557 else { /* UTF */
f65d3ee7 3558 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3559 STRLEN ulen1, ulen2;
e7ae6809
JH
3560 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3561 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3562
3563 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3564 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3565
872c91ae
JH
3566 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3567 ckWARN(WARN_UTF8) ?
3568 0 : UTF8_ALLOW_ANY);
3569 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3570 ckWARN(WARN_UTF8) ?
3571 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3572 }
3573 else {
872c91ae
JH
3574 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3575 ckWARN(WARN_UTF8) ?
3576 0 : UTF8_ALLOW_ANY);
5f80c4cf 3577 }
1aa99e6b
IH
3578 }
3579 }
bbce6d69 3580 }
a0d0e21e 3581 else
bbce6d69 3582 c1 = c2 = -1000;
cca55fe3 3583 assume_ok_easy:
3280af22 3584 PL_reginput = locinput;
a0d0e21e 3585 if (minmod) {
c277df42 3586 CHECKPOINT lastcp;
a0d0e21e
LW
3587 minmod = 0;
3588 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3589 sayNO;
a0ed51b3 3590 locinput = PL_reginput;
02db2b7b 3591 REGCP_SET(lastcp);
0fe9bf95 3592 if (c1 != -1000) {
1aa99e6b 3593 char *e; /* Should not check after this */
0fe9bf95 3594 char *old = locinput;
b2f2f093 3595 int count = 0;
0fe9bf95 3596
1aa99e6b 3597 if (n == REG_INFTY) {
0fe9bf95 3598 e = PL_regeol - 1;
1aa99e6b
IH
3599 if (do_utf8)
3600 while (UTF8_IS_CONTINUATION(*(U8*)e))
3601 e--;
3602 }
3603 else if (do_utf8) {
3604 int m = n - ln;
3605 for (e = locinput;
3606 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3607 e += UTF8SKIP(e);
3608 }
3609 else {
3610 e = locinput + n - ln;
3611 if (e >= PL_regeol)
3612 e = PL_regeol - 1;
3613 }
0fe9bf95
IZ
3614 while (1) {
3615 /* Find place 'next' could work */
1aa99e6b
IH
3616 if (!do_utf8) {
3617 if (c1 == c2) {
a8e8ab15
JH
3618 while (locinput <= e &&
3619 UCHARAT(locinput) != c1)
1aa99e6b
IH
3620 locinput++;
3621 } else {
9041c2e3 3622 while (locinput <= e
a8e8ab15
JH
3623 && UCHARAT(locinput) != c1
3624 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3625 locinput++;
3626 }
3627 count = locinput - old;
3628 }
3629 else {
3630 STRLEN len;
3631 if (c1 == c2) {
872c91ae
JH
3632 /* count initialised to
3633 * utf8_distance(old, locinput) */
b2f2f093 3634 while (locinput <= e &&
872c91ae
JH
3635 utf8n_to_uvchr((U8*)locinput,
3636 UTF8_MAXLEN, &len,
3637 ckWARN(WARN_UTF8) ?
eb160463 3638 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3639 locinput += len;
b2f2f093
JH
3640 count++;
3641 }
1aa99e6b 3642 } else {
872c91ae
JH
3643 /* count initialised to
3644 * utf8_distance(old, locinput) */
b2f2f093 3645 while (locinput <= e) {
872c91ae
JH
3646 UV c = utf8n_to_uvchr((U8*)locinput,
3647 UTF8_MAXLEN, &len,
3648 ckWARN(WARN_UTF8) ?
3649 0 : UTF8_ALLOW_ANY);
eb160463 3650 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3651 break;
b2f2f093
JH
3652 locinput += len;
3653 count++;
1aa99e6b
IH
3654 }
3655 }
0fe9bf95 3656 }
9041c2e3 3657 if (locinput > e)
0fe9bf95
IZ
3658 sayNO;
3659 /* PL_reginput == old now */
3660 if (locinput != old) {
3661 ln = 1; /* Did some */
1aa99e6b 3662 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3663 sayNO;
3664 }
3665 /* PL_reginput == locinput now */
29d1e993 3666 TRYPAREN(paren, ln, locinput);
0fe9bf95 3667 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3668 REGCP_UNWIND(lastcp);
0fe9bf95 3669 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3670 old = locinput;
3671 if (do_utf8)
3672 locinput += UTF8SKIP(locinput);
3673 else
3674 locinput++;
b2f2f093 3675 count = 1;
0fe9bf95
IZ
3676 }
3677 }
3678 else
c277df42 3679 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3680 UV c;
3681 if (c1 != -1000) {
3682 if (do_utf8)
872c91ae
JH
3683 c = utf8n_to_uvchr((U8*)PL_reginput,
3684 UTF8_MAXLEN, 0,
3685 ckWARN(WARN_UTF8) ?
3686 0 : UTF8_ALLOW_ANY);
1aa99e6b 3687 else
9041c2e3 3688 c = UCHARAT(PL_reginput);
2390ecbc 3689 /* If it could work, try it. */
eb160463 3690 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3691 {
ecc99935 3692 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3693 REGCP_UNWIND(lastcp);
3694 }
1aa99e6b 3695 }
a0d0e21e 3696 /* If it could work, try it. */
2390ecbc 3697 else if (c1 == -1000)
bbce6d69 3698 {
ecc99935 3699 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3700 REGCP_UNWIND(lastcp);
bbce6d69 3701 }
c277df42 3702 /* Couldn't or didn't -- move forward. */
a0ed51b3 3703 PL_reginput = locinput;
a0d0e21e
LW
3704 if (regrepeat(scan, 1)) {
3705 ln++;
a0ed51b3
LW
3706 locinput = PL_reginput;
3707 }
3708 else
4633a7c4 3709 sayNO;
a0d0e21e
LW
3710 }
3711 }
3712 else {
c277df42 3713 CHECKPOINT lastcp;
a0d0e21e 3714 n = regrepeat(scan, n);
a0ed51b3 3715 locinput = PL_reginput;
22c35a8c 3716 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3717 ((!PL_multiline && OP(next) != MEOL) ||
3718 OP(next) == SEOL || OP(next) == EOS))
3719 {
a0d0e21e 3720 ln = n; /* why back off? */
1aeab75a
GS
3721 /* ...because $ and \Z can match before *and* after
3722 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3723 We should back off by one in this case. */
3724 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3725 ln--;
3726 }
02db2b7b 3727 REGCP_SET(lastcp);
c277df42 3728 if (paren) {
8fa7f367 3729 UV c = 0;
c277df42 3730 while (n >= ln) {
1aa99e6b
IH
3731 if (c1 != -1000) {
3732 if (do_utf8)
872c91ae
JH
3733 c = utf8n_to_uvchr((U8*)PL_reginput,
3734 UTF8_MAXLEN, 0,
3735 ckWARN(WARN_UTF8) ?
3736 0 : UTF8_ALLOW_ANY);
1aa99e6b 3737 else
9041c2e3 3738 c = UCHARAT(PL_reginput);
1aa99e6b 3739 }
c277df42 3740 /* If it could work, try it. */
eb160463 3741 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3742 {
29d1e993 3743 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3744 REGCP_UNWIND(lastcp);
c277df42
IZ
3745 }
3746 /* Couldn't or didn't -- back up. */
3747 n--;
dfe13c55 3748 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3749 }
a0ed51b3
LW
3750 }
3751 else {
8fa7f367 3752 UV c = 0;
c277df42 3753 while (n >= ln) {
1aa99e6b
IH
3754 if (c1 != -1000) {
3755 if (do_utf8)
872c91ae
JH
3756 c = utf8n_to_uvchr((U8*)PL_reginput,
3757 UTF8_MAXLEN, 0,
3758 ckWARN(WARN_UTF8) ?
3759 0 : UTF8_ALLOW_ANY);
1aa99e6b 3760 else
9041c2e3 3761 c = UCHARAT(PL_reginput);
1aa99e6b 3762 }
c277df42 3763 /* If it could work, try it. */
eb160463 3764 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3765 {
29d1e993 3766 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3767 REGCP_UNWIND(lastcp);
c277df42
IZ
3768 }
3769 /* Couldn't or didn't -- back up. */
3770 n--;
dfe13c55 3771 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3772 }
a0d0e21e
LW
3773 }
3774 }
4633a7c4 3775 sayNO;
c277df42 3776 break;
a0d0e21e 3777 case END:
0f5d15d6
IZ
3778 if (PL_reg_call_cc) {
3779 re_cc_state *cur_call_cc = PL_reg_call_cc;
3780 CURCUR *cctmp = PL_regcc;
3781 regexp *re = PL_reg_re;
3782 CHECKPOINT cp, lastcp;
3783
3784 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3785 REGCP_SET(lastcp);
0f5d15d6
IZ
3786 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3787 the caller. */
3788 PL_reginput = locinput; /* Make position available to
3789 the callcc. */
3790 cache_re(PL_reg_call_cc->re);
3791 PL_regcc = PL_reg_call_cc->cc;
3792 PL_reg_call_cc = PL_reg_call_cc->prev;
3793 if (regmatch(cur_call_cc->node)) {
3794 PL_reg_call_cc = cur_call_cc;
3795 regcpblow(cp);
3796 sayYES;
3797 }
02db2b7b 3798 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3799 regcppop();
3800 PL_reg_call_cc = cur_call_cc;
3801 PL_regcc = cctmp;
3802 PL_reg_re = re;
3803 cache_re(re);
3804
3805 DEBUG_r(
3806 PerlIO_printf(Perl_debug_log,
3807 "%*s continuation failed...\n",
3808 REPORT_CODE_OFF+PL_regindent*2, "")
3809 );
7821416a 3810 sayNO_SILENT;
0f5d15d6 3811 }
7821416a
IZ
3812 if (locinput < PL_regtill) {
3813 DEBUG_r(PerlIO_printf(Perl_debug_log,
3814 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3815 PL_colors[4],
3816 (long)(locinput - PL_reg_starttry),
3817 (long)(PL_regtill - PL_reg_starttry),
3818 PL_colors[5]));
3819 sayNO_FINAL; /* Cannot match: too short. */
3820 }
3821 PL_reginput = locinput; /* put where regtry can find it */
3822 sayYES_FINAL; /* Success! */
7e5428c5 3823 case SUCCEED:
3280af22 3824 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3825 sayYES_LOUD; /* Success! */
c277df42
IZ
3826 case SUSPEND:
3827 n = 1;
9fe1d20c 3828 PL_reginput = locinput;
9041c2e3 3829 goto do_ifmatch;
a0d0e21e 3830 case UNLESSM:
c277df42 3831 n = 0;
a0ed51b3 3832 if (scan->flags) {
efb30f32
HS
3833 s = HOPBACKc(locinput, scan->flags);
3834 if (!s)
3835 goto say_yes;
3836 PL_reginput = s;
a0ed51b3
LW
3837 }
3838 else
3839 PL_reginput = locinput;
c277df42
IZ
3840 goto do_ifmatch;
3841 case IFMATCH:
3842 n = 1;
a0ed51b3 3843 if (scan->flags) {
efb30f32
HS
3844 s = HOPBACKc(locinput, scan->flags);
3845 if (!s)
3846 goto say_no;
3847 PL_reginput = s;
a0ed51b3
LW
3848 }
3849 else
3850 PL_reginput = locinput;
3851
c277df42 3852 do_ifmatch:
c277df42
IZ
3853 inner = NEXTOPER(NEXTOPER(scan));
3854 if (regmatch(inner) != n) {
3855 say_no:
3856 if (logical) {
3857 logical = 0;
3858 sw = 0;
3859 goto do_longjump;
a0ed51b3
LW
3860 }
3861 else
c277df42
IZ
3862 sayNO;
3863 }
3864 say_yes:
3865 if (logical) {
3866 logical = 0;
3867 sw = 1;
3868 }
fe44a5e8 3869 if (OP(scan) == SUSPEND) {
3280af22 3870 locinput = PL_reginput;
565764a8 3871 nextchr = UCHARAT(locinput);
fe44a5e8 3872 }
c277df42
IZ
3873 /* FALL THROUGH. */
3874 case LONGJMP:
3875 do_longjump:
3876 next = scan + ARG(scan);
3877 if (next == scan)
3878 next = NULL;
a0d0e21e
LW
3879 break;
3880 default:
b900a521 3881 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3882 PTR2UV(scan), OP(scan));
cea2e8a9 3883 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3884 }
02db2b7b 3885 reenter:
a0d0e21e
LW
3886 scan = next;
3887 }
a687059c 3888
a0d0e21e
LW
3889 /*
3890 * We get here only if there's trouble -- normally "case END" is
3891 * the terminating point.
3892 */
cea2e8a9 3893 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3894 /*NOTREACHED*/
4633a7c4
LW
3895 sayNO;
3896
7821416a
IZ
3897yes_loud:
3898 DEBUG_r(
3899 PerlIO_printf(Perl_debug_log,
3900 "%*s %scould match...%s\n",
3901 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3902 );
3903 goto yes;
3904yes_final:
3905 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3906 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3907yes:
3908#ifdef DEBUGGING
3280af22 3909 PL_regindent--;
4633a7c4 3910#endif
02db2b7b
IZ
3911
3912#if 0 /* Breaks $^R */
3913 if (unwind)
3914 regcpblow(firstcp);
3915#endif
4633a7c4
LW
3916 return 1;
3917
3918no:
7821416a
IZ
3919 DEBUG_r(
3920 PerlIO_printf(Perl_debug_log,
3921 "%*s %sfailed...%s\n",
3922 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3923 );
3924 goto do_no;
3925no_final:
3926do_no:
02db2b7b
IZ
3927 if (unwind) {
3928 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3929
3930 switch (uw->type) {
3931 case RE_UNWIND_BRANCH:
3932 case RE_UNWIND_BRANCHJ:
3933 {
3934 re_unwind_branch_t *uwb = &(uw->branch);
3935 I32 lastparen = uwb->lastparen;
9041c2e3 3936
02db2b7b
IZ
3937 REGCP_UNWIND(uwb->lastcp);
3938 for (n = *PL_reglastparen; n > lastparen; n--)
3939 PL_regendp[n] = -1;
3940 *PL_reglastparen = n;
3941 scan = next = uwb->next;
9041c2e3
NIS
3942 if ( !scan ||
3943 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3944 ? BRANCH : BRANCHJ) ) { /* Failure */
3945 unwind = uwb->prev;
3946#ifdef DEBUGGING
3947 PL_regindent--;
3948#endif
3949 goto do_no;
3950 }
3951 /* Have more choice yet. Reuse the same uwb. */
3952 /*SUPPRESS 560*/
3953 if ((n = (uwb->type == RE_UNWIND_BRANCH
3954 ? NEXT_OFF(next) : ARG(next))))
3955 next += n;
3956 else
3957 next = NULL; /* XXXX Needn't unwinding in this case... */
3958 uwb->next = next;
3959 next = NEXTOPER(scan);
3960 if (uwb->type == RE_UNWIND_BRANCHJ)
3961 next = NEXTOPER(next);
3962 locinput = uwb->locinput;
3963 nextchr = uwb->nextchr;
3964#ifdef DEBUGGING
3965 PL_regindent = uwb->regindent;
3966#endif
3967
3968 goto reenter;
3969 }
3970 /* NOT REACHED */
3971 default:
3972 Perl_croak(aTHX_ "regexp unwind memory corruption");
3973 }
3974 /* NOT REACHED */
3975 }
4633a7c4 3976#ifdef DEBUGGING
3280af22 3977 PL_regindent--;
4633a7c4 3978#endif
a0d0e21e 3979 return 0;
a687059c
LW
3980}
3981
3982/*
3983 - regrepeat - repeatedly match something simple, report how many
3984 */
3985/*
3986 * [This routine now assumes that it will only match on things of length 1.
3987 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 3988 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 3989 */
76e3520e 3990STATIC I32
cea2e8a9 3991S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 3992{
a0d0e21e 3993 register char *scan;
a0d0e21e 3994 register I32 c;
3280af22 3995 register char *loceol = PL_regeol;
a0ed51b3 3996 register I32 hardcount = 0;
53c4c00c 3997 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 3998
3280af22 3999 scan = PL_reginput;
faf11cac
HS
4000 if (max == REG_INFTY)
4001 max = I32_MAX;
4002 else if (max < loceol - scan)
a0d0e21e 4003 loceol = scan + max;
a0d0e21e 4004 switch (OP(p)) {
22c35a8c 4005 case REG_ANY:
1aa99e6b 4006 if (do_utf8) {
ffc61ed2 4007 loceol = PL_regeol;
1aa99e6b 4008 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4009 scan += UTF8SKIP(scan);
4010 hardcount++;
4011 }
4012 } else {
4013 while (scan < loceol && *scan != '\n')
4014 scan++;
a0ed51b3
LW
4015 }
4016 break;
ffc61ed2 4017 case SANY:
def8e4ea
JH
4018 if (do_utf8) {
4019 loceol = PL_regeol;
a0804c9e 4020 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4021 scan += UTF8SKIP(scan);
4022 hardcount++;
4023 }
4024 }
4025 else
4026 scan = loceol;
a0ed51b3 4027 break;
f33976b4
DB
4028 case CANY:
4029 scan = loceol;
4030 break;
bbce6d69 4031 case EXACT: /* length of string is 1 */
cd439c50 4032 c = (U8)*STRING(p);
bbce6d69 4033 while (scan < loceol && UCHARAT(scan) == c)
4034 scan++;
4035 break;
4036 case EXACTF: /* length of string is 1 */
cd439c50 4037 c = (U8)*STRING(p);
bbce6d69 4038 while (scan < loceol &&
22c35a8c 4039 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4040 scan++;
4041 break;
4042 case EXACTFL: /* length of string is 1 */
3280af22 4043 PL_reg_flags |= RF_tainted;
cd439c50 4044 c = (U8)*STRING(p);
bbce6d69 4045 while (scan < loceol &&
22c35a8c 4046 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4047 scan++;
4048 break;
4049 case ANYOF:
ffc61ed2
JH
4050 if (do_utf8) {
4051 loceol = PL_regeol;
1aa99e6b 4052 while (hardcount < max && scan < loceol &&
ba7b4546 4053 reginclass(p, (U8*)scan, 0, do_utf8)) {
ffc61ed2
JH
4054 scan += UTF8SKIP(scan);
4055 hardcount++;
4056 }
4057 } else {
7d3e948e 4058 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4059 scan++;
4060 }
a0d0e21e
LW
4061 break;
4062 case ALNUM:
1aa99e6b 4063 if (do_utf8) {
ffc61ed2 4064 loceol = PL_regeol;
8269fa76 4065 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4066 while (hardcount < max && scan < loceol &&
3568d838 4067 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4068 scan += UTF8SKIP(scan);
4069 hardcount++;
4070 }
4071 } else {
4072 while (scan < loceol && isALNUM(*scan))
4073 scan++;
a0ed51b3
LW
4074 }
4075 break;
bbce6d69 4076 case ALNUML:
3280af22 4077 PL_reg_flags |= RF_tainted;
1aa99e6b 4078 if (do_utf8) {
ffc61ed2 4079 loceol = PL_regeol;
1aa99e6b
IH
4080 while (hardcount < max && scan < loceol &&
4081 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4082 scan += UTF8SKIP(scan);
4083 hardcount++;
4084 }
4085 } else {
4086 while (scan < loceol && isALNUM_LC(*scan))
4087 scan++;
a0ed51b3
LW
4088 }
4089 break;
a0d0e21e 4090 case NALNUM:
1aa99e6b 4091 if (do_utf8) {
ffc61ed2 4092 loceol = PL_regeol;
8269fa76 4093 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4094 while (hardcount < max && scan < loceol &&
3568d838 4095 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4096 scan += UTF8SKIP(scan);
4097 hardcount++;
4098 }
4099 } else {
4100 while (scan < loceol && !isALNUM(*scan))
4101 scan++;
a0ed51b3
LW
4102 }
4103 break;
bbce6d69 4104 case NALNUML:
3280af22 4105 PL_reg_flags |= RF_tainted;
1aa99e6b 4106 if (do_utf8) {
ffc61ed2 4107 loceol = PL_regeol;
1aa99e6b
IH
4108 while (hardcount < max && scan < loceol &&
4109 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4110 scan += UTF8SKIP(scan);
4111 hardcount++;
4112 }
4113 } else {
4114 while (scan < loceol && !isALNUM_LC(*scan))
4115 scan++;
a0ed51b3
LW
4116 }
4117 break;
a0d0e21e 4118 case SPACE:
1aa99e6b 4119 if (do_utf8) {
ffc61ed2 4120 loceol = PL_regeol;
8269fa76 4121 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4122 while (hardcount < max && scan < loceol &&
3568d838
JH
4123 (*scan == ' ' ||
4124 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4125 scan += UTF8SKIP(scan);
4126 hardcount++;
4127 }
4128 } else {
4129 while (scan < loceol && isSPACE(*scan))
4130 scan++;
a0ed51b3
LW
4131 }
4132 break;
bbce6d69 4133 case SPACEL:
3280af22 4134 PL_reg_flags |= RF_tainted;
1aa99e6b 4135 if (do_utf8) {
ffc61ed2 4136 loceol = PL_regeol;
1aa99e6b 4137 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4138 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4139 scan += UTF8SKIP(scan);
4140 hardcount++;
4141 }
4142 } else {
4143 while (scan < loceol && isSPACE_LC(*scan))
4144 scan++;
a0ed51b3
LW
4145 }
4146 break;
a0d0e21e 4147 case NSPACE:
1aa99e6b 4148 if (do_utf8) {
ffc61ed2 4149 loceol = PL_regeol;
8269fa76 4150 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4151 while (hardcount < max && scan < loceol &&
3568d838
JH
4152 !(*scan == ' ' ||
4153 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4154 scan += UTF8SKIP(scan);
4155 hardcount++;
4156 }
4157 } else {
4158 while (scan < loceol && !isSPACE(*scan))
4159 scan++;
4160 break;
a0ed51b3 4161 }
bbce6d69 4162 case NSPACEL:
3280af22 4163 PL_reg_flags |= RF_tainted;
1aa99e6b 4164 if (do_utf8) {
ffc61ed2 4165 loceol = PL_regeol;
1aa99e6b 4166 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4167 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4168 scan += UTF8SKIP(scan);
4169 hardcount++;
4170 }
4171 } else {
4172 while (scan < loceol && !isSPACE_LC(*scan))
4173 scan++;
a0ed51b3
LW
4174 }
4175 break;
a0d0e21e 4176 case DIGIT:
1aa99e6b 4177 if (do_utf8) {
ffc61ed2 4178 loceol = PL_regeol;
8269fa76 4179 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4180 while (hardcount < max && scan < loceol &&
3568d838 4181 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4182 scan += UTF8SKIP(scan);
4183 hardcount++;
4184 }
4185 } else {
4186 while (scan < loceol && isDIGIT(*scan))
4187 scan++;
a0ed51b3
LW
4188 }
4189 break;
a0d0e21e 4190 case NDIGIT:
1aa99e6b 4191 if (do_utf8) {
ffc61ed2 4192 loceol = PL_regeol;
8269fa76 4193 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4194 while (hardcount < max && scan < loceol &&
3568d838 4195 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4196 scan += UTF8SKIP(scan);
4197 hardcount++;
4198 }
4199 } else {
4200 while (scan < loceol && !isDIGIT(*scan))
4201 scan++;
a0ed51b3
LW
4202 }
4203 break;
a0d0e21e
LW
4204 default: /* Called on something of 0 width. */
4205 break; /* So match right here or not at all. */
4206 }
a687059c 4207
a0ed51b3
LW
4208 if (hardcount)
4209 c = hardcount;
4210 else
4211 c = scan - PL_reginput;
3280af22 4212 PL_reginput = scan;
a687059c 4213
9041c2e3 4214 DEBUG_r(
c277df42
IZ
4215 {
4216 SV *prop = sv_newmortal();
4217
4218 regprop(prop, p);
9041c2e3
NIS
4219 PerlIO_printf(Perl_debug_log,
4220 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4221 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4222 });
9041c2e3 4223
a0d0e21e 4224 return(c);
a687059c
LW
4225}
4226
4227/*
c277df42 4228 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4229 *
c277df42
IZ
4230 * The repeater is supposed to have constant length.
4231 */
4232
76e3520e 4233STATIC I32
cea2e8a9 4234S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4235{
b7953727 4236 register char *scan = Nullch;
c277df42 4237 register char *start;
3280af22 4238 register char *loceol = PL_regeol;
a0ed51b3 4239 I32 l = 0;
708e3b05 4240 I32 count = 0, res = 1;
a0ed51b3
LW
4241
4242 if (!max)
4243 return 0;
c277df42 4244
3280af22 4245 start = PL_reginput;
53c4c00c 4246 if (PL_reg_match_utf8) {
708e3b05 4247 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4248 if (!count++) {
4249 l = 0;
4250 while (start < PL_reginput) {
4251 l++;
4252 start += UTF8SKIP(start);
4253 }
4254 *lp = l;
4255 if (l == 0)
4256 return max;
4257 }
4258 if (count == max)
4259 return count;
4260 }
4261 }
4262 else {
708e3b05 4263 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4264 if (!count++) {
4265 *lp = l = PL_reginput - start;
4266 if (max != REG_INFTY && l*max < loceol - scan)
4267 loceol = scan + l*max;
4268 if (l == 0)
4269 return max;
c277df42
IZ
4270 }
4271 }
4272 }
708e3b05 4273 if (!res)
3280af22 4274 PL_reginput = scan;
9041c2e3 4275
a0ed51b3 4276 return count;
c277df42
IZ
4277}
4278
4279/*
ffc61ed2
JH
4280- regclass_swash - prepare the utf8 swash
4281*/
4282
4283SV *
9e55ce06 4284Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4285{
9e55ce06
JH
4286 SV *sw = NULL;
4287 SV *si = NULL;
4288 SV *alt = NULL;
ffc61ed2
JH
4289
4290 if (PL_regdata && PL_regdata->count) {
4291 U32 n = ARG(node);
4292
4293 if (PL_regdata->what[n] == 's') {
4294 SV *rv = (SV*)PL_regdata->data[n];
4295 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4296 SV **ary = AvARRAY(av);
9e55ce06 4297 SV **a, **b;
9041c2e3 4298
9e55ce06
JH
4299 /* See the end of regcomp.c:S_reglass() for
4300 * documentation of these array elements. */
4301
b11f357e
JH
4302 si = *ary;
4303 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4304 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4305
ffc61ed2
JH
4306 if (a)
4307 sw = *a;
4308 else if (si && doinit) {
4309 sw = swash_init("utf8", "", si, 1, 0);
4310 (void)av_store(av, 1, sw);
4311 }
9e55ce06
JH
4312 if (b)
4313 alt = *b;
ffc61ed2
JH
4314 }
4315 }
4316
9e55ce06
JH
4317 if (listsvp)
4318 *listsvp = si;
4319 if (altsvp)
4320 *altsvp = alt;
ffc61ed2
JH
4321
4322 return sw;
4323}
4324
4325/*
ba7b4546 4326 - reginclass - determine if a character falls into a character class
832705d4
JH
4327
4328 The n is the ANYOF regnode, the p is the target string, lenp
4329 is pointer to the maximum length of how far to go in the p
4330 (if the lenp is zero, UTF8SKIP(p) is used),
4331 do_utf8 tells whether the target string is in UTF-8.
4332
bbce6d69 4333 */
4334
76e3520e 4335STATIC bool
ba7b4546 4336S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4337{
ffc61ed2 4338 char flags = ANYOF_FLAGS(n);
bbce6d69 4339 bool match = FALSE;
cc07378b 4340 UV c = *p;
ae9ddab8 4341 STRLEN len = 0;
9e55ce06 4342 STRLEN plen;
1aa99e6b 4343
ae9ddab8
JH
4344 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4345 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4346 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4347
0f0076b4 4348 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4349 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4350 if (lenp)
4351 *lenp = 0;
ffc61ed2 4352 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4353 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4354 match = TRUE;
bbce6d69 4355 }
3568d838 4356 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4357 match = TRUE;
ffc61ed2 4358 if (!match) {
9e55ce06
JH
4359 AV *av;
4360 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4361
4362 if (sw) {
3568d838 4363 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4364 match = TRUE;
4365 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4366 if (!match && lenp && av) {
4367 I32 i;
4368
4369 for (i = 0; i <= av_len(av); i++) {
4370 SV* sv = *av_fetch(av, i, FALSE);
4371 STRLEN len;
4372 char *s = SvPV(sv, len);
4373
061b10df 4374 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4375 *lenp = len;
4376 match = TRUE;
4377 break;
4378 }
4379 }
4380 }
4381 if (!match) {
4a623e43
JH
4382 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4383 STRLEN tmplen;
4384
9e55ce06
JH
4385 to_utf8_fold(p, tmpbuf, &tmplen);
4386 if (swash_fetch(sw, tmpbuf, do_utf8))
4387 match = TRUE;
4388 }
ffc61ed2
JH
4389 }
4390 }
bbce6d69 4391 }
9e55ce06 4392 if (match && lenp && *lenp == 0)
0f0076b4 4393 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4394 }
1aa99e6b 4395 if (!match && c < 256) {
ffc61ed2
JH
4396 if (ANYOF_BITMAP_TEST(n, c))
4397 match = TRUE;
4398 else if (flags & ANYOF_FOLD) {
eb160463 4399 U8 f;
a0ed51b3 4400
ffc61ed2
JH
4401 if (flags & ANYOF_LOCALE) {
4402 PL_reg_flags |= RF_tainted;
4403 f = PL_fold_locale[c];
4404 }
4405 else
4406 f = PL_fold[c];
4407 if (f != c && ANYOF_BITMAP_TEST(n, f))
4408 match = TRUE;
4409 }
4410
4411 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4412 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4413 if (
4414 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4415 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4416 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4417 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4418 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4419 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4420 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4421 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4422 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4423 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4424 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4425 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4426 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4427 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4428 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4429 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4430 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4431 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4432 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4433 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4434 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4435 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4436 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4437 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4438 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4439 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4440 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4441 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4442 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4443 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4444 ) /* How's that for a conditional? */
4445 {
4446 match = TRUE;
4447 }
a0ed51b3 4448 }
a0ed51b3
LW
4449 }
4450
a0ed51b3
LW
4451 return (flags & ANYOF_INVERT) ? !match : match;
4452}
161b471a 4453
dfe13c55 4454STATIC U8 *
cea2e8a9 4455S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4456{
1aa99e6b
IH
4457 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4458}
4459
4460STATIC U8 *
4461S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4462{
a0ed51b3 4463 if (off >= 0) {
1aa99e6b 4464 while (off-- && s < lim) {
ffc61ed2 4465 /* XXX could check well-formedness here */
a0ed51b3 4466 s += UTF8SKIP(s);
ffc61ed2 4467 }
a0ed51b3
LW
4468 }
4469 else {
4470 while (off++) {
1aa99e6b 4471 if (s > lim) {
a0ed51b3 4472 s--;
ffc61ed2 4473 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4474 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4475 s--;
ffc61ed2
JH
4476 }
4477 /* XXX could check well-formedness here */
a0ed51b3
LW
4478 }
4479 }
4480 }
4481 return s;
4482}
161b471a 4483
dfe13c55 4484STATIC U8 *
1aa99e6b 4485S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4486{
1aa99e6b
IH
4487 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4488}
4489
4490STATIC U8 *
4491S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4492{
4493 if (off >= 0) {
1aa99e6b 4494 while (off-- && s < lim) {
ffc61ed2 4495 /* XXX could check well-formedness here */
a0ed51b3 4496 s += UTF8SKIP(s);
ffc61ed2 4497 }
a0ed51b3
LW
4498 if (off >= 0)
4499 return 0;
4500 }
4501 else {
4502 while (off++) {
1aa99e6b 4503 if (s > lim) {
a0ed51b3 4504 s--;
ffc61ed2 4505 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4506 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4507 s--;
ffc61ed2
JH
4508 }
4509 /* XXX could check well-formedness here */
a0ed51b3
LW
4510 }
4511 else
4512 break;
4513 }
4514 if (off <= 0)
4515 return 0;
4516 }
4517 return s;
4518}
51371543 4519
51371543 4520static void
acfe0abc 4521restore_pos(pTHX_ void *arg)
51371543 4522{
51371543
GS
4523 if (PL_reg_eval_set) {
4524 if (PL_reg_oldsaved) {
4525 PL_reg_re->subbeg = PL_reg_oldsaved;
4526 PL_reg_re->sublen = PL_reg_oldsavedlen;
4527 RX_MATCH_COPIED_on(PL_reg_re);
4528 }
4529 PL_reg_magic->mg_len = PL_reg_oldpos;
4530 PL_reg_eval_set = 0;
4531 PL_curpm = PL_reg_oldcurpm;
4532 }
4533}
33b8afdf
JH
4534
4535STATIC void
4536S_to_utf8_substr(pTHX_ register regexp *prog)
4537{
4538 SV* sv;
4539 if (prog->float_substr && !prog->float_utf8) {
4540 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4541 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4542 sv_utf8_upgrade(sv);
4543 if (SvTAIL(prog->float_substr))
4544 SvTAIL_on(sv);
4545 if (prog->float_substr == prog->check_substr)
4546 prog->check_utf8 = sv;
4547 }
4548 if (prog->anchored_substr && !prog->anchored_utf8) {
4549 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4550 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4551 sv_utf8_upgrade(sv);
4552 if (SvTAIL(prog->anchored_substr))
4553 SvTAIL_on(sv);
4554 if (prog->anchored_substr == prog->check_substr)
4555 prog->check_utf8 = sv;
4556 }
4557}
4558
4559STATIC void
4560S_to_byte_substr(pTHX_ register regexp *prog)
4561{
4562 SV* sv;
4563 if (prog->float_utf8 && !prog->float_substr) {
4564 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4565 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4566 if (sv_utf8_downgrade(sv, TRUE)) {
4567 if (SvTAIL(prog->float_utf8))
4568 SvTAIL_on(sv);
4569 } else {
4570 SvREFCNT_dec(sv);
4571 prog->float_substr = sv = &PL_sv_undef;
4572 }
4573 if (prog->float_utf8 == prog->check_utf8)
4574 prog->check_substr = sv;
4575 }
4576 if (prog->anchored_utf8 && !prog->anchored_substr) {
4577 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4578 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4579 if (sv_utf8_downgrade(sv, TRUE)) {
4580 if (SvTAIL(prog->anchored_utf8))
4581 SvTAIL_on(sv);
4582 } else {
4583 SvREFCNT_dec(sv);
4584 prog->anchored_substr = sv = &PL_sv_undef;
4585 }
4586 if (prog->anchored_utf8 == prog->check_utf8)
4587 prog->check_substr = sv;
4588 }
4589}