This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [perl #19236] perlsyn: implicit localisation in while()
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
be3c0a43 70 **** Copyright (c) 1991-2002, Larry Wall
a687059c 71 ****
9ef589d8
LW
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
a687059c
LW
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
864dbfa3 80#define PERL_IN_REGEXEC_C
a687059c 81#include "perl.h"
0f5d15d6 82
a687059c
LW
83#include "regcomp.h"
84
c277df42
IZ
85#define RF_tainted 1 /* tainted information used? */
86#define RF_warned 2 /* warned about big count? */
ce862d02 87#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
88#define RF_utf8 8 /* String contains multibyte chars? */
89
eb160463 90#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
91
92#define RS_init 1 /* eval environment created */
93#define RS_set 2 /* replsv value is set */
c277df42 94
a687059c
LW
95#ifndef STATIC
96#define STATIC static
97#endif
98
ba7b4546 99#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 100
c277df42
IZ
101/*
102 * Forwards.
103 */
104
33b8afdf 105#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 106#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 107
dfe13c55
GS
108#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
109#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
110#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
111#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
112#define HOPc(pos,off) ((char*)HOP(pos,off))
113#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 114
efb30f32 115#define HOPBACK(pos, off) ( \
e54858b0 116 (PL_reg_match_utf8) \
efb30f32
HS
117 ? reghopmaybe((U8*)pos, -off) \
118 : (pos - off >= PL_bostr) \
119 ? (U8*)(pos - off) \
120 : (U8*)NULL \
121)
122#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
123
1aa99e6b
IH
124#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
125#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
127#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
128#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
129#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
130
977e4b84 131#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 132
5f80c4cf 133/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
134#define JUMPABLE(rn) ( \
135 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
136 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
137 OP(rn) == PLUS || OP(rn) == MINMOD || \
138 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
139)
140
cca55fe3
JP
141#define HAS_TEXT(rn) ( \
142 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
143)
e2d8ce26 144
a84d97b6
HS
145/*
146 Search for mandatory following text node; for lookahead, the text must
147 follow but for lookbehind (rn->flags != 0) we skip to the next step.
148*/
cca55fe3 149#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 150 while (JUMPABLE(rn)) \
a84d97b6 151 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 152 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
153 else if (OP(rn) == PLUS) \
154 rn = NEXTOPER(rn); \
a84d97b6
HS
155 else if (OP(rn) == IFMATCH) \
156 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 157 else rn += NEXT_OFF(rn); \
5f80c4cf 158} STMT_END
74750237 159
acfe0abc 160static void restore_pos(pTHX_ void *arg);
51371543 161
76e3520e 162STATIC CHECKPOINT
cea2e8a9 163S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 164{
3280af22 165 int retval = PL_savestack_ix;
b1ce53c5
JH
166#define REGCP_PAREN_ELEMS 4
167 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
168 int p;
169
e49a9654
IH
170 if (paren_elems_to_push < 0)
171 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
172
a01268b5 173#define REGCP_OTHER_ELEMS 6
b1ce53c5 174 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 175 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 176/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
177 SSPUSHINT(PL_regendp[p]);
178 SSPUSHINT(PL_regstartp[p]);
3280af22 179 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
180 SSPUSHINT(p);
181 }
b1ce53c5 182/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
183 SSPUSHINT(PL_regsize);
184 SSPUSHINT(*PL_reglastparen);
a01268b5 185 SSPUSHINT(*PL_reglastcloseparen);
3280af22 186 SSPUSHPTR(PL_reginput);
41123dfd
JH
187#define REGCP_FRAME_ELEMS 2
188/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
189 * are needed for the regexp context stack bookkeeping. */
190 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 191 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 192
a0d0e21e
LW
193 return retval;
194}
195
c277df42 196/* These are needed since we do not localize EVAL nodes: */
02db2b7b 197# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 198 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 199 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 200
02db2b7b 201# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 202 PerlIO_printf(Perl_debug_log, \
faccc32b 203 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 204 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 205
76e3520e 206STATIC char *
cea2e8a9 207S_regcppop(pTHX)
a0d0e21e 208{
b1ce53c5 209 I32 i;
a0d0e21e
LW
210 U32 paren = 0;
211 char *input;
cf93c79d 212 I32 tmps;
b1ce53c5
JH
213
214 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 215 i = SSPOPINT;
b1ce53c5
JH
216 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
217 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 218 input = (char *) SSPOPPTR;
a01268b5 219 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
220 *PL_reglastparen = SSPOPINT;
221 PL_regsize = SSPOPINT;
b1ce53c5
JH
222
223 /* Now restore the parentheses context. */
41123dfd
JH
224 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
225 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 226 paren = (U32)SSPOPINT;
3280af22 227 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
228 PL_regstartp[paren] = SSPOPINT;
229 tmps = SSPOPINT;
3280af22
NIS
230 if (paren <= *PL_reglastparen)
231 PL_regendp[paren] = tmps;
c277df42 232 DEBUG_r(
c3464db5 233 PerlIO_printf(Perl_debug_log,
b900a521 234 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 235 (UV)paren, (IV)PL_regstartp[paren],
b900a521 236 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 237 (IV)PL_regendp[paren],
3280af22 238 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 239 );
a0d0e21e 240 }
c277df42 241 DEBUG_r(
eb160463 242 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 243 PerlIO_printf(Perl_debug_log,
faccc32b
JH
244 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
245 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
246 }
247 );
daf18116 248#if 1
dafc8851
JH
249 /* It would seem that the similar code in regtry()
250 * already takes care of this, and in fact it is in
251 * a better location to since this code can #if 0-ed out
252 * but the code in regtry() is needed or otherwise tests
253 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
254 * (as of patchlevel 7877) will fail. Then again,
255 * this code seems to be necessary or otherwise
256 * building DynaLoader will fail:
257 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
258 * --jhi */
eb160463
GS
259 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
260 if ((I32)paren > PL_regsize)
cf93c79d
IZ
261 PL_regstartp[paren] = -1;
262 PL_regendp[paren] = -1;
a0d0e21e 263 }
dafc8851 264#endif
a0d0e21e
LW
265 return input;
266}
267
0f5d15d6 268STATIC char *
cea2e8a9 269S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
270{
271 I32 tmp = PL_savestack_ix;
272
273 PL_savestack_ix = ss;
274 regcppop();
275 PL_savestack_ix = tmp;
942e002e 276 return Nullch;
0f5d15d6
IZ
277}
278
279typedef struct re_cc_state
280{
281 I32 ss;
282 regnode *node;
283 struct re_cc_state *prev;
284 CURCUR *cc;
285 regexp *re;
286} re_cc_state;
287
02db2b7b 288#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 289
29d1e993
HS
290#define TRYPAREN(paren, n, input) { \
291 if (paren) { \
292 if (n) { \
293 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
294 PL_regendp[paren] = input - PL_bostr; \
295 } \
296 else \
297 PL_regendp[paren] = -1; \
298 } \
299 if (regmatch(next)) \
300 sayYES; \
301 if (paren && n) \
302 PL_regendp[paren] = -1; \
303}
304
305
a687059c 306/*
e50aee73 307 * pregexec and friends
a687059c
LW
308 */
309
310/*
c277df42 311 - pregexec - match a regexp against a string
a687059c 312 */
c277df42 313I32
864dbfa3 314Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 315 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
316/* strend: pointer to null at end of string */
317/* strbeg: real beginning of string */
318/* minend: end of match must be >=minend after stringarg. */
319/* nosave: For optimizations. */
320{
321 return
9041c2e3 322 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
323 nosave ? 0 : REXEC_COPY_STR);
324}
0f5d15d6
IZ
325
326STATIC void
cea2e8a9 327S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
328{
329 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
330#ifdef DEBUGGING
331 PL_regprogram = prog->program;
332#endif
333 PL_regnpar = prog->nparens;
9041c2e3
NIS
334 PL_regdata = prog->data;
335 PL_reg_re = prog;
0f5d15d6 336}
22e551b9 337
9041c2e3 338/*
cad2e5aa
JH
339 * Need to implement the following flags for reg_anch:
340 *
341 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
342 * USE_INTUIT_ML
343 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
344 * INTUIT_AUTORITATIVE_ML
345 * INTUIT_ONCE_NOML - Intuit can match in one location only.
346 * INTUIT_ONCE_ML
347 *
348 * Another flag for this function: SECOND_TIME (so that float substrs
349 * with giant delta may be not rechecked).
350 */
351
352/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
353
2c2d71f5 354/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
355 Otherwise, only SvCUR(sv) is used to get strbeg. */
356
357/* XXXX We assume that strpos is strbeg unless sv. */
358
6eb5f6b9
JH
359/* XXXX Some places assume that there is a fixed substring.
360 An update may be needed if optimizer marks as "INTUITable"
361 RExen without fixed substrings. Similarly, it is assumed that
362 lengths of all the strings are no more than minlen, thus they
363 cannot come from lookahead.
364 (Or minlen should take into account lookahead.) */
365
2c2d71f5
JH
366/* A failure to find a constant substring means that there is no need to make
367 an expensive call to REx engine, thus we celebrate a failure. Similarly,
368 finding a substring too deep into the string means that less calls to
30944b6d
IZ
369 regtry() should be needed.
370
371 REx compiler's optimizer found 4 possible hints:
372 a) Anchored substring;
373 b) Fixed substring;
374 c) Whether we are anchored (beginning-of-line or \G);
375 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 376 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
377 string which does not contradict any of them.
378 */
2c2d71f5 379
6eb5f6b9
JH
380/* Most of decisions we do here should have been done at compile time.
381 The nodes of the REx which we used for the search should have been
382 deleted from the finite automaton. */
383
cad2e5aa
JH
384char *
385Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
386 char *strend, U32 flags, re_scream_pos_data *data)
387{
b7953727 388 register I32 start_shift = 0;
cad2e5aa 389 /* Should be nonnegative! */
b7953727 390 register I32 end_shift = 0;
2c2d71f5
JH
391 register char *s;
392 register SV *check;
a1933d95 393 char *strbeg;
cad2e5aa 394 char *t;
33b8afdf 395 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 396 I32 ml_anch;
6eb5f6b9 397 register char *other_last = Nullch; /* other substr checked before this */
b7953727 398 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
399#ifdef DEBUGGING
400 char *i_strpos = strpos;
ce333219 401 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 402#endif
cad2e5aa 403
b8d68ded
JH
404 if (prog->reganch & ROPT_UTF8) {
405 DEBUG_r(PerlIO_printf(Perl_debug_log,
406 "UTF-8 regex...\n"));
407 PL_reg_flags |= RF_utf8;
408 }
409
2a782b5b 410 DEBUG_r({
b8d68ded 411 char *s = PL_reg_match_utf8 ?
c728cb41
JH
412 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
413 strpos;
b8d68ded
JH
414 int len = PL_reg_match_utf8 ?
415 strlen(s) : strend - strpos;
2a782b5b
JH
416 if (!PL_colorset)
417 reginitcolors();
b8d68ded
JH
418 if (PL_reg_match_utf8)
419 DEBUG_r(PerlIO_printf(Perl_debug_log,
420 "UTF-8 target...\n"));
2a782b5b
JH
421 PerlIO_printf(Perl_debug_log,
422 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
423 PL_colors[4],PL_colors[5],PL_colors[0],
424 prog->precomp,
425 PL_colors[1],
426 (strlen(prog->precomp) > 60 ? "..." : ""),
427 PL_colors[0],
428 (int)(len > 60 ? 60 : len),
429 s, PL_colors[1],
430 (len > 60 ? "..." : "")
431 );
432 });
cad2e5aa 433
c344f387
JH
434 /* CHR_DIST() would be more correct here but it makes things slow. */
435 if (prog->minlen > strend - strpos) {
a72c7584
JH
436 DEBUG_r(PerlIO_printf(Perl_debug_log,
437 "String too short... [re_intuit_start]\n"));
cad2e5aa 438 goto fail;
2c2d71f5 439 }
a1933d95 440 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 441 PL_regeol = strend;
33b8afdf
JH
442 if (do_utf8) {
443 if (!prog->check_utf8 && prog->check_substr)
444 to_utf8_substr(prog);
445 check = prog->check_utf8;
446 } else {
447 if (!prog->check_substr && prog->check_utf8)
448 to_byte_substr(prog);
449 check = prog->check_substr;
450 }
451 if (check == &PL_sv_undef) {
452 DEBUG_r(PerlIO_printf(Perl_debug_log,
453 "Non-utf string cannot match utf check string\n"));
454 goto fail;
455 }
2c2d71f5 456 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
457 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
458 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 459 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 460
7e25d62c
JH
461 if (!ml_anch) {
462 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
463 | ROPT_IMPLICIT)) /* not a real BOL */
464 /* SvCUR is not set on references: SvRV and SvPVX overlap */
465 && sv && !SvROK(sv)
466 && (strpos != strbeg)) {
467 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
468 goto fail;
469 }
470 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 471 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 472 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
473 I32 slen;
474
1aa99e6b 475 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
476 if (SvTAIL(check)) {
477 slen = SvCUR(check); /* >= 1 */
cad2e5aa 478
9041c2e3 479 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
480 || (strend - s == slen && strend[-1] != '\n')) {
481 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
482 goto fail_finish;
cad2e5aa
JH
483 }
484 /* Now should match s[0..slen-2] */
485 slen--;
653099ff 486 if (slen && (*SvPVX(check) != *s
cad2e5aa 487 || (slen > 1
653099ff 488 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
489 report_neq:
490 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
491 goto fail_finish;
492 }
cad2e5aa 493 }
653099ff
GS
494 else if (*SvPVX(check) != *s
495 || ((slen = SvCUR(check)) > 1
496 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
497 goto report_neq;
498 goto success_at_start;
7e25d62c 499 }
cad2e5aa 500 }
2c2d71f5 501 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 502 s = strpos;
2c2d71f5 503 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 504 end_shift = prog->minlen - start_shift -
653099ff 505 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 506 if (!ml_anch) {
653099ff
GS
507 I32 end = prog->check_offset_max + CHR_SVLEN(check)
508 - (SvTAIL(check) != 0);
1aa99e6b 509 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
510
511 if (end_shift < eshift)
512 end_shift = eshift;
513 }
cad2e5aa 514 }
2c2d71f5 515 else { /* Can match at random position */
cad2e5aa
JH
516 ml_anch = 0;
517 s = strpos;
2c2d71f5
JH
518 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
519 /* Should be nonnegative! */
520 end_shift = prog->minlen - start_shift -
653099ff 521 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
522 }
523
2c2d71f5 524#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 525 if (end_shift < 0)
6bbae5e6 526 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
527#endif
528
2c2d71f5
JH
529 restart:
530 /* Find a possible match in the region s..strend by looking for
531 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 532 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
533 I32 p = -1; /* Internal iterator of scream. */
534 I32 *pp = data ? data->scream_pos : &p;
535
2c2d71f5
JH
536 if (PL_screamfirst[BmRARE(check)] >= 0
537 || ( BmRARE(check) == '\n'
538 && (BmPREVIOUS(check) == SvCUR(check) - 1)
539 && SvTAIL(check) ))
9041c2e3 540 s = screaminstr(sv, check,
2c2d71f5 541 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 542 else
2c2d71f5 543 goto fail_finish;
cad2e5aa
JH
544 if (data)
545 *data->scream_olds = s;
546 }
f33976b4 547 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
548 s = fbm_instr((U8*)(s + start_shift),
549 (U8*)(strend - end_shift),
550 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 551 else
1aa99e6b
IH
552 s = fbm_instr(HOP3(s, start_shift, strend),
553 HOP3(strend, -end_shift, strbeg),
2c2d71f5 554 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
555
556 /* Update the count-of-usability, remove useless subpatterns,
557 unshift s. */
2c2d71f5
JH
558
559 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
560 (s ? "Found" : "Did not find"),
33b8afdf 561 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 562 PL_colors[0],
7b0972df
JH
563 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
564 SvPVX(check),
2c2d71f5
JH
565 PL_colors[1], (SvTAIL(check) ? "$" : ""),
566 (s ? " at offset " : "...\n") ) );
567
568 if (!s)
569 goto fail_finish;
570
6eb5f6b9
JH
571 check_at = s;
572
2c2d71f5 573 /* Finish the diagnostic message */
30944b6d 574 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
575
576 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
577 Start with the other substr.
578 XXXX no SCREAM optimization yet - and a very coarse implementation
579 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
580 *always* match. Probably should be marked during compile...
581 Probably it is right to do no SCREAM here...
582 */
583
33b8afdf 584 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 585 /* Take into account the "other" substring. */
2c2d71f5
JH
586 /* XXXX May be hopelessly wrong for UTF... */
587 if (!other_last)
6eb5f6b9 588 other_last = strpos;
33b8afdf 589 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
590 do_other_anchored:
591 {
1aa99e6b 592 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 593 char *s1 = s;
33b8afdf 594 SV* must;
2c2d71f5 595
2c2d71f5
JH
596 t = s - prog->check_offset_max;
597 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 598 && (!do_utf8
1aa99e6b 599 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 600 && t > strpos)))
30944b6d 601 /* EMPTY */;
2c2d71f5
JH
602 else
603 t = strpos;
1aa99e6b 604 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
605 if (t < other_last) /* These positions already checked */
606 t = other_last;
1aa99e6b 607 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
608 if (last < last1)
609 last1 = last;
610 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
611 /* On end-of-str: see comment below. */
33b8afdf
JH
612 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
613 if (must == &PL_sv_undef) {
614 s = (char*)NULL;
615 DEBUG_r(must = prog->anchored_utf8); /* for debug */
616 }
617 else
618 s = fbm_instr(
619 (unsigned char*)t,
620 HOP3(HOP3(last1, prog->anchored_offset, strend)
621 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
622 must,
623 PL_multiline ? FBMrf_MULTILINE : 0
624 );
1aa99e6b
IH
625 DEBUG_r(PerlIO_printf(Perl_debug_log,
626 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
627 (s ? "Found" : "Contradicts"),
628 PL_colors[0],
33b8afdf
JH
629 (int)(SvCUR(must)
630 - (SvTAIL(must)!=0)),
631 SvPVX(must),
632 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
633 if (!s) {
634 if (last1 >= last2) {
635 DEBUG_r(PerlIO_printf(Perl_debug_log,
636 ", giving up...\n"));
637 goto fail_finish;
638 }
639 DEBUG_r(PerlIO_printf(Perl_debug_log,
640 ", trying floating at offset %ld...\n",
1aa99e6b
IH
641 (long)(HOP3c(s1, 1, strend) - i_strpos)));
642 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
643 s = HOP3c(last, 1, strend);
2c2d71f5
JH
644 goto restart;
645 }
646 else {
647 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 648 (long)(s - i_strpos)));
1aa99e6b
IH
649 t = HOP3c(s, -prog->anchored_offset, strbeg);
650 other_last = HOP3c(s, 1, strend);
30944b6d 651 s = s1;
2c2d71f5
JH
652 if (t == strpos)
653 goto try_at_start;
2c2d71f5
JH
654 goto try_at_offset;
655 }
30944b6d 656 }
2c2d71f5
JH
657 }
658 else { /* Take into account the floating substring. */
33b8afdf
JH
659 char *last, *last1;
660 char *s1 = s;
661 SV* must;
662
663 t = HOP3c(s, -start_shift, strbeg);
664 last1 = last =
665 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
666 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
667 last = HOP3c(t, prog->float_max_offset, strend);
668 s = HOP3c(t, prog->float_min_offset, strend);
669 if (s < other_last)
670 s = other_last;
2c2d71f5 671 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
672 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
673 /* fbm_instr() takes into account exact value of end-of-str
674 if the check is SvTAIL(ed). Since false positives are OK,
675 and end-of-str is not later than strend we are OK. */
676 if (must == &PL_sv_undef) {
677 s = (char*)NULL;
678 DEBUG_r(must = prog->float_utf8); /* for debug message */
679 }
680 else
2c2d71f5 681 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
682 (unsigned char*)last + SvCUR(must)
683 - (SvTAIL(must)!=0),
684 must, PL_multiline ? FBMrf_MULTILINE : 0);
685 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
686 (s ? "Found" : "Contradicts"),
687 PL_colors[0],
688 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
689 SvPVX(must),
690 PL_colors[1], (SvTAIL(must) ? "$" : "")));
691 if (!s) {
692 if (last1 == last) {
2c2d71f5 693 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
694 ", giving up...\n"));
695 goto fail_finish;
2c2d71f5 696 }
33b8afdf
JH
697 DEBUG_r(PerlIO_printf(Perl_debug_log,
698 ", trying anchored starting at offset %ld...\n",
699 (long)(s1 + 1 - i_strpos)));
700 other_last = last;
701 s = HOP3c(t, 1, strend);
702 goto restart;
703 }
704 else {
705 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
706 (long)(s - i_strpos)));
707 other_last = s; /* Fix this later. --Hugo */
708 s = s1;
709 if (t == strpos)
710 goto try_at_start;
711 goto try_at_offset;
712 }
2c2d71f5 713 }
cad2e5aa 714 }
2c2d71f5
JH
715
716 t = s - prog->check_offset_max;
2c2d71f5 717 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 718 && (!do_utf8
1aa99e6b
IH
719 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
720 && t > strpos))) {
2c2d71f5
JH
721 /* Fixed substring is found far enough so that the match
722 cannot start at strpos. */
723 try_at_offset:
cad2e5aa 724 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
725 /* Eventually fbm_*() should handle this, but often
726 anchored_offset is not 0, so this check will not be wasted. */
727 /* XXXX In the code below we prefer to look for "^" even in
728 presence of anchored substrings. And we search even
729 beyond the found float position. These pessimizations
730 are historical artefacts only. */
731 find_anchor:
2c2d71f5 732 while (t < strend - prog->minlen) {
cad2e5aa 733 if (*t == '\n') {
4ee3650e 734 if (t < check_at - prog->check_offset_min) {
33b8afdf 735 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
736 /* Since we moved from the found position,
737 we definitely contradict the found anchored
30944b6d
IZ
738 substr. Due to the above check we do not
739 contradict "check" substr.
740 Thus we can arrive here only if check substr
741 is float. Redo checking for "other"=="fixed".
742 */
9041c2e3 743 strpos = t + 1;
30944b6d
IZ
744 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
745 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
746 goto do_other_anchored;
747 }
4ee3650e
GS
748 /* We don't contradict the found floating substring. */
749 /* XXXX Why not check for STCLASS? */
cad2e5aa 750 s = t + 1;
2c2d71f5 751 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 752 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
753 goto set_useful;
754 }
4ee3650e
GS
755 /* Position contradicts check-string */
756 /* XXXX probably better to look for check-string
757 than for "\n", so one should lower the limit for t? */
758 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 759 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 760 other_last = strpos = s = t + 1;
cad2e5aa
JH
761 goto restart;
762 }
763 t++;
764 }
2c2d71f5
JH
765 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
766 PL_colors[0],PL_colors[1]));
767 goto fail_finish;
cad2e5aa 768 }
f5952150
GS
769 else {
770 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
771 PL_colors[0],PL_colors[1]));
772 }
cad2e5aa
JH
773 s = t;
774 set_useful:
33b8afdf 775 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
776 }
777 else {
f5952150 778 /* The found string does not prohibit matching at strpos,
2c2d71f5 779 - no optimization of calling REx engine can be performed,
f5952150
GS
780 unless it was an MBOL and we are not after MBOL,
781 or a future STCLASS check will fail this. */
2c2d71f5
JH
782 try_at_start:
783 /* Even in this situation we may use MBOL flag if strpos is offset
784 wrt the start of the string. */
05b4157f 785 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 786 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
787 /* May be due to an implicit anchor of m{.*foo} */
788 && !(prog->reganch & ROPT_IMPLICIT))
789 {
cad2e5aa
JH
790 t = strpos;
791 goto find_anchor;
792 }
30944b6d 793 DEBUG_r( if (ml_anch)
f5952150
GS
794 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
795 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 796 );
2c2d71f5 797 success_at_start:
30944b6d 798 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
799 && (do_utf8 ? (
800 prog->check_utf8 /* Could be deleted already */
801 && --BmUSEFUL(prog->check_utf8) < 0
802 && (prog->check_utf8 == prog->float_utf8)
803 ) : (
804 prog->check_substr /* Could be deleted already */
805 && --BmUSEFUL(prog->check_substr) < 0
806 && (prog->check_substr == prog->float_substr)
807 )))
66e933ab 808 {
cad2e5aa 809 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 810 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
811 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
812 if (do_utf8 ? prog->check_substr : prog->check_utf8)
813 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
814 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
815 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 816 check = Nullsv; /* abort */
cad2e5aa 817 s = strpos;
3cf5c195
IZ
818 /* XXXX This is a remnant of the old implementation. It
819 looks wasteful, since now INTUIT can use many
6eb5f6b9 820 other heuristics. */
cad2e5aa
JH
821 prog->reganch &= ~RE_USE_INTUIT;
822 }
823 else
824 s = strpos;
825 }
826
6eb5f6b9
JH
827 /* Last resort... */
828 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
829 if (prog->regstclass) {
830 /* minlen == 0 is possible if regstclass is \b or \B,
831 and the fixed substr is ''$.
832 Since minlen is already taken into account, s+1 is before strend;
833 accidentally, minlen >= 1 guaranties no false positives at s + 1
834 even for \b or \B. But (minlen? 1 : 0) below assumes that
835 regstclass does not come from lookahead... */
836 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
837 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 838 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 839 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 840 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 841 : 1);
33b8afdf 842 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 843 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 844 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
845 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
846 cl_l, strend)
847 : strend);
a1933d95 848 char *startpos = strbeg;
6eb5f6b9
JH
849
850 t = s;
76384e4a 851 if (prog->reganch & ROPT_UTF8) {
ffc61ed2 852 PL_regdata = prog->data;
76384e4a
GS
853 PL_bostr = startpos;
854 }
f33976b4 855 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
856 if (!s) {
857#ifdef DEBUGGING
b7953727 858 char *what = 0;
6eb5f6b9
JH
859#endif
860 if (endpos == strend) {
861 DEBUG_r( PerlIO_printf(Perl_debug_log,
862 "Could not match STCLASS...\n") );
863 goto fail;
864 }
66e933ab
GS
865 DEBUG_r( PerlIO_printf(Perl_debug_log,
866 "This position contradicts STCLASS...\n") );
653099ff
GS
867 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
868 goto fail;
6eb5f6b9 869 /* Contradict one of substrings */
33b8afdf
JH
870 if (prog->anchored_substr || prog->anchored_utf8) {
871 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
872 DEBUG_r( what = "anchored" );
873 hop_and_restart:
1aa99e6b 874 s = HOP3c(t, 1, strend);
66e933ab
GS
875 if (s + start_shift + end_shift > strend) {
876 /* XXXX Should be taken into account earlier? */
877 DEBUG_r( PerlIO_printf(Perl_debug_log,
878 "Could not match STCLASS...\n") );
879 goto fail;
880 }
5e39e1e5
HS
881 if (!check)
882 goto giveup;
6eb5f6b9 883 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 884 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
885 what, (long)(s + start_shift - i_strpos)) );
886 goto restart;
887 }
66e933ab 888 /* Have both, check_string is floating */
6eb5f6b9
JH
889 if (t + start_shift >= check_at) /* Contradicts floating=check */
890 goto retry_floating_check;
891 /* Recheck anchored substring, but not floating... */
9041c2e3 892 s = check_at;
5e39e1e5
HS
893 if (!check)
894 goto giveup;
6eb5f6b9 895 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 896 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
897 (long)(other_last - i_strpos)) );
898 goto do_other_anchored;
899 }
60e71179
GS
900 /* Another way we could have checked stclass at the
901 current position only: */
902 if (ml_anch) {
903 s = t = t + 1;
5e39e1e5
HS
904 if (!check)
905 goto giveup;
60e71179 906 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
907 "Looking for /%s^%s/m starting at offset %ld...\n",
908 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 909 goto try_at_offset;
66e933ab 910 }
33b8afdf 911 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 912 goto fail;
6eb5f6b9
JH
913 /* Check is floating subtring. */
914 retry_floating_check:
915 t = check_at - start_shift;
916 DEBUG_r( what = "floating" );
917 goto hop_and_restart;
918 }
b7953727
JH
919 if (t != s) {
920 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 921 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
922 (long)(t - i_strpos), (long)(s - i_strpos))
923 );
924 }
925 else {
926 DEBUG_r(PerlIO_printf(Perl_debug_log,
927 "Does not contradict STCLASS...\n");
928 );
929 }
6eb5f6b9 930 }
5e39e1e5
HS
931 giveup:
932 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
933 PL_colors[4], (check ? "Guessed" : "Giving up"),
934 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 935 return s;
2c2d71f5
JH
936
937 fail_finish: /* Substring not found */
33b8afdf
JH
938 if (prog->check_substr || prog->check_utf8) /* could be removed already */
939 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 940 fail:
2c2d71f5 941 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
942 PL_colors[4],PL_colors[5]));
943 return Nullch;
944}
9661b544 945
6eb5f6b9 946/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
947STATIC char *
948S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 949{
6eb5f6b9
JH
950 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
951 char *m;
d8093b23
G
952 STRLEN ln;
953 unsigned int c1;
954 unsigned int c2;
6eb5f6b9
JH
955 char *e;
956 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 957 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 958
6eb5f6b9
JH
959 /* We know what class it must start with. */
960 switch (OP(c)) {
6eb5f6b9 961 case ANYOF:
388cc4de
HS
962 if (do_utf8) {
963 while (s < strend) {
964 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
965 !UTF8_IS_INVARIANT((U8)s[0]) ?
966 reginclass(c, (U8*)s, 0, do_utf8) :
967 REGINCLASS(c, (U8*)s)) {
968 if (tmp && (norun || regtry(prog, s)))
969 goto got_it;
970 else
971 tmp = doevery;
972 }
973 else
974 tmp = 1;
975 s += UTF8SKIP(s);
976 }
977 }
978 else {
979 while (s < strend) {
980 STRLEN skip = 1;
981
982 if (REGINCLASS(c, (U8*)s) ||
983 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
984 /* The assignment of 2 is intentional:
985 * for the folded sharp s, the skip is 2. */
986 (skip = SHARP_S_SKIP))) {
987 if (tmp && (norun || regtry(prog, s)))
988 goto got_it;
989 else
990 tmp = doevery;
991 }
992 else
993 tmp = 1;
994 s += skip;
995 }
a0d0e21e 996 }
6eb5f6b9 997 break;
f33976b4
DB
998 case CANY:
999 while (s < strend) {
1000 if (tmp && (norun || regtry(prog, s)))
1001 goto got_it;
1002 else
1003 tmp = doevery;
1004 s++;
1005 }
1006 break;
6eb5f6b9
JH
1007 case EXACTF:
1008 m = STRING(c);
1009 ln = STR_LEN(c);
1aa99e6b 1010 if (UTF) {
a2a2844f 1011 STRLEN ulen1, ulen2;
e7ae6809
JH
1012 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1013 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
1014
1015 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1016 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1017
872c91ae
JH
1018 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1019 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1020 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1021 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1022 }
1023 else {
1024 c1 = *(U8*)m;
1025 c2 = PL_fold[c1];
1026 }
6eb5f6b9
JH
1027 goto do_exactf;
1028 case EXACTFL:
1029 m = STRING(c);
1030 ln = STR_LEN(c);
d8093b23 1031 c1 = *(U8*)m;
6eb5f6b9
JH
1032 c2 = PL_fold_locale[c1];
1033 do_exactf:
6795a346 1034 e = HOP3c(strend, -(I32)ln, s);
b3c9acc1 1035
6eb5f6b9
JH
1036 if (norun && e < s)
1037 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1038
60a8b682
JH
1039 /* The idea in the EXACTF* cases is to first find the
1040 * first character of the EXACTF* node and then, if
1041 * necessary, case-insensitively compare the full
1042 * text of the node. The c1 and c2 are the first
1043 * characters (though in Unicode it gets a bit
1044 * more complicated because there are more cases
7f16dd3d
JH
1045 * than just upper and lower: one needs to use
1046 * the so-called folding case for case-insensitive
1047 * matching (called "loose matching" in Unicode).
1048 * ibcmp_utf8() will do just that. */
60a8b682 1049
1aa99e6b 1050 if (do_utf8) {
575cac57
JH
1051 UV c, f;
1052 U8 tmpbuf [UTF8_MAXLEN+1];
1053 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1054 STRLEN len, foldlen;
d7f013c8 1055
09091399 1056 if (c1 == c2) {
1aa99e6b 1057 while (s <= e) {
872c91ae
JH
1058 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1059 ckWARN(WARN_UTF8) ?
1060 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1061 if ( c == c1
1062 && (ln == len ||
66423254 1063 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1064 m, (char **)0, ln, (bool)UTF))
55da9344 1065 && (norun || regtry(prog, s)) )
1aa99e6b 1066 goto got_it;
80aecb99
JH
1067 else {
1068 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1069 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1070 if ( f != c
1071 && (f == c1 || f == c2)
1072 && (ln == foldlen ||
66423254
JH
1073 !ibcmp_utf8((char *) foldbuf,
1074 (char **)0, foldlen, do_utf8,
d07ddd77 1075 m,
eb160463 1076 (char **)0, ln, (bool)UTF))
80aecb99
JH
1077 && (norun || regtry(prog, s)) )
1078 goto got_it;
1079 }
1aa99e6b
IH
1080 s += len;
1081 }
09091399
JH
1082 }
1083 else {
1aa99e6b 1084 while (s <= e) {
872c91ae
JH
1085 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1086 ckWARN(WARN_UTF8) ?
1087 0 : UTF8_ALLOW_ANY);
80aecb99 1088
60a8b682 1089 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1090 * Note that not all the possible combinations
1091 * are handled here: some of them are handled
1092 * by the standard folding rules, and some of
1093 * them (the character class or ANYOF cases)
1094 * are handled during compiletime in
1095 * regexec.c:S_regclass(). */
880bd946
JH
1096 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1097 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1098 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1099
1100 if ( (c == c1 || c == c2)
1101 && (ln == len ||
66423254 1102 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1103 m, (char **)0, ln, (bool)UTF))
55da9344 1104 && (norun || regtry(prog, s)) )
1aa99e6b 1105 goto got_it;
80aecb99
JH
1106 else {
1107 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1108 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1109 if ( f != c
1110 && (f == c1 || f == c2)
1111 && (ln == foldlen ||
a6872d42 1112 !ibcmp_utf8((char *) foldbuf,
66423254 1113 (char **)0, foldlen, do_utf8,
d07ddd77 1114 m,
eb160463 1115 (char **)0, ln, (bool)UTF))
80aecb99
JH
1116 && (norun || regtry(prog, s)) )
1117 goto got_it;
1118 }
1aa99e6b
IH
1119 s += len;
1120 }
09091399 1121 }
1aa99e6b
IH
1122 }
1123 else {
1124 if (c1 == c2)
1125 while (s <= e) {
1126 if ( *(U8*)s == c1
1127 && (ln == 1 || !(OP(c) == EXACTF
1128 ? ibcmp(s, m, ln)
1129 : ibcmp_locale(s, m, ln)))
1130 && (norun || regtry(prog, s)) )
1131 goto got_it;
1132 s++;
1133 }
1134 else
1135 while (s <= e) {
1136 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1137 && (ln == 1 || !(OP(c) == EXACTF
1138 ? ibcmp(s, m, ln)
1139 : ibcmp_locale(s, m, ln)))
1140 && (norun || regtry(prog, s)) )
1141 goto got_it;
1142 s++;
1143 }
b3c9acc1
IZ
1144 }
1145 break;
bbce6d69 1146 case BOUNDL:
3280af22 1147 PL_reg_flags |= RF_tainted;
bbce6d69 1148 /* FALL THROUGH */
a0d0e21e 1149 case BOUND:
ffc61ed2 1150 if (do_utf8) {
12d33761 1151 if (s == PL_bostr)
ffc61ed2
JH
1152 tmp = '\n';
1153 else {
b4f7163a 1154 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1155
b4f7163a 1156 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1157 }
1158 tmp = ((OP(c) == BOUND ?
9041c2e3 1159 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1160 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1161 while (s < strend) {
1162 if (tmp == !(OP(c) == BOUND ?
3568d838 1163 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1164 isALNUM_LC_utf8((U8*)s)))
1165 {
1166 tmp = !tmp;
1167 if ((norun || regtry(prog, s)))
1168 goto got_it;
1169 }
1170 s += UTF8SKIP(s);
a687059c 1171 }
a0d0e21e 1172 }
667bb95a 1173 else {
12d33761 1174 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1175 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1176 while (s < strend) {
1177 if (tmp ==
1178 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1179 tmp = !tmp;
1180 if ((norun || regtry(prog, s)))
1181 goto got_it;
1182 }
1183 s++;
a0ed51b3 1184 }
a0ed51b3 1185 }
6eb5f6b9 1186 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1187 goto got_it;
1188 break;
bbce6d69 1189 case NBOUNDL:
3280af22 1190 PL_reg_flags |= RF_tainted;
bbce6d69 1191 /* FALL THROUGH */
a0d0e21e 1192 case NBOUND:
ffc61ed2 1193 if (do_utf8) {
12d33761 1194 if (s == PL_bostr)
ffc61ed2
JH
1195 tmp = '\n';
1196 else {
b4f7163a 1197 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1198
b4f7163a 1199 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1200 }
1201 tmp = ((OP(c) == NBOUND ?
9041c2e3 1202 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1203 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1204 while (s < strend) {
1205 if (tmp == !(OP(c) == NBOUND ?
3568d838 1206 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1207 isALNUM_LC_utf8((U8*)s)))
1208 tmp = !tmp;
1209 else if ((norun || regtry(prog, s)))
1210 goto got_it;
1211 s += UTF8SKIP(s);
1212 }
a0d0e21e 1213 }
667bb95a 1214 else {
12d33761 1215 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1216 tmp = ((OP(c) == NBOUND ?
1217 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1218 while (s < strend) {
1219 if (tmp ==
1220 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1221 tmp = !tmp;
1222 else if ((norun || regtry(prog, s)))
1223 goto got_it;
1224 s++;
1225 }
a0ed51b3 1226 }
6eb5f6b9 1227 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1228 goto got_it;
1229 break;
a0d0e21e 1230 case ALNUM:
ffc61ed2 1231 if (do_utf8) {
8269fa76 1232 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1233 while (s < strend) {
3568d838 1234 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1235 if (tmp && (norun || regtry(prog, s)))
1236 goto got_it;
1237 else
1238 tmp = doevery;
1239 }
bbce6d69 1240 else
ffc61ed2
JH
1241 tmp = 1;
1242 s += UTF8SKIP(s);
bbce6d69 1243 }
bbce6d69 1244 }
ffc61ed2
JH
1245 else {
1246 while (s < strend) {
1247 if (isALNUM(*s)) {
1248 if (tmp && (norun || regtry(prog, s)))
1249 goto got_it;
1250 else
1251 tmp = doevery;
1252 }
a0ed51b3 1253 else
ffc61ed2
JH
1254 tmp = 1;
1255 s++;
a0ed51b3 1256 }
a0ed51b3
LW
1257 }
1258 break;
bbce6d69 1259 case ALNUML:
3280af22 1260 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1261 if (do_utf8) {
1262 while (s < strend) {
1263 if (isALNUM_LC_utf8((U8*)s)) {
1264 if (tmp && (norun || regtry(prog, s)))
1265 goto got_it;
1266 else
1267 tmp = doevery;
1268 }
a687059c 1269 else
ffc61ed2
JH
1270 tmp = 1;
1271 s += UTF8SKIP(s);
a0d0e21e 1272 }
a0d0e21e 1273 }
ffc61ed2
JH
1274 else {
1275 while (s < strend) {
1276 if (isALNUM_LC(*s)) {
1277 if (tmp && (norun || regtry(prog, s)))
1278 goto got_it;
1279 else
1280 tmp = doevery;
1281 }
a0ed51b3 1282 else
ffc61ed2
JH
1283 tmp = 1;
1284 s++;
a0ed51b3 1285 }
a0ed51b3
LW
1286 }
1287 break;
a0d0e21e 1288 case NALNUM:
ffc61ed2 1289 if (do_utf8) {
8269fa76 1290 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1291 while (s < strend) {
3568d838 1292 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1293 if (tmp && (norun || regtry(prog, s)))
1294 goto got_it;
1295 else
1296 tmp = doevery;
1297 }
bbce6d69 1298 else
ffc61ed2
JH
1299 tmp = 1;
1300 s += UTF8SKIP(s);
bbce6d69 1301 }
bbce6d69 1302 }
ffc61ed2
JH
1303 else {
1304 while (s < strend) {
1305 if (!isALNUM(*s)) {
1306 if (tmp && (norun || regtry(prog, s)))
1307 goto got_it;
1308 else
1309 tmp = doevery;
1310 }
a0ed51b3 1311 else
ffc61ed2
JH
1312 tmp = 1;
1313 s++;
a0ed51b3 1314 }
a0ed51b3
LW
1315 }
1316 break;
bbce6d69 1317 case NALNUML:
3280af22 1318 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1319 if (do_utf8) {
1320 while (s < strend) {
1321 if (!isALNUM_LC_utf8((U8*)s)) {
1322 if (tmp && (norun || regtry(prog, s)))
1323 goto got_it;
1324 else
1325 tmp = doevery;
1326 }
a687059c 1327 else
ffc61ed2
JH
1328 tmp = 1;
1329 s += UTF8SKIP(s);
a687059c 1330 }
a0d0e21e 1331 }
ffc61ed2
JH
1332 else {
1333 while (s < strend) {
1334 if (!isALNUM_LC(*s)) {
1335 if (tmp && (norun || regtry(prog, s)))
1336 goto got_it;
1337 else
1338 tmp = doevery;
1339 }
a0ed51b3 1340 else
ffc61ed2
JH
1341 tmp = 1;
1342 s++;
a0ed51b3 1343 }
a0ed51b3
LW
1344 }
1345 break;
a0d0e21e 1346 case SPACE:
ffc61ed2 1347 if (do_utf8) {
8269fa76 1348 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1349 while (s < strend) {
3568d838 1350 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1351 if (tmp && (norun || regtry(prog, s)))
1352 goto got_it;
1353 else
1354 tmp = doevery;
1355 }
a0d0e21e 1356 else
ffc61ed2
JH
1357 tmp = 1;
1358 s += UTF8SKIP(s);
2304df62 1359 }
a0d0e21e 1360 }
ffc61ed2
JH
1361 else {
1362 while (s < strend) {
1363 if (isSPACE(*s)) {
1364 if (tmp && (norun || regtry(prog, s)))
1365 goto got_it;
1366 else
1367 tmp = doevery;
1368 }
a0ed51b3 1369 else
ffc61ed2
JH
1370 tmp = 1;
1371 s++;
a0ed51b3 1372 }
a0ed51b3
LW
1373 }
1374 break;
bbce6d69 1375 case SPACEL:
3280af22 1376 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1377 if (do_utf8) {
1378 while (s < strend) {
1379 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1380 if (tmp && (norun || regtry(prog, s)))
1381 goto got_it;
1382 else
1383 tmp = doevery;
1384 }
bbce6d69 1385 else
ffc61ed2
JH
1386 tmp = 1;
1387 s += UTF8SKIP(s);
bbce6d69 1388 }
bbce6d69 1389 }
ffc61ed2
JH
1390 else {
1391 while (s < strend) {
1392 if (isSPACE_LC(*s)) {
1393 if (tmp && (norun || regtry(prog, s)))
1394 goto got_it;
1395 else
1396 tmp = doevery;
1397 }
a0ed51b3 1398 else
ffc61ed2
JH
1399 tmp = 1;
1400 s++;
a0ed51b3 1401 }
a0ed51b3
LW
1402 }
1403 break;
a0d0e21e 1404 case NSPACE:
ffc61ed2 1405 if (do_utf8) {
8269fa76 1406 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1407 while (s < strend) {
3568d838 1408 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1409 if (tmp && (norun || regtry(prog, s)))
1410 goto got_it;
1411 else
1412 tmp = doevery;
1413 }
a0d0e21e 1414 else
ffc61ed2
JH
1415 tmp = 1;
1416 s += UTF8SKIP(s);
a687059c 1417 }
a0d0e21e 1418 }
ffc61ed2
JH
1419 else {
1420 while (s < strend) {
1421 if (!isSPACE(*s)) {
1422 if (tmp && (norun || regtry(prog, s)))
1423 goto got_it;
1424 else
1425 tmp = doevery;
1426 }
a0ed51b3 1427 else
ffc61ed2
JH
1428 tmp = 1;
1429 s++;
a0ed51b3 1430 }
a0ed51b3
LW
1431 }
1432 break;
bbce6d69 1433 case NSPACEL:
3280af22 1434 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1435 if (do_utf8) {
1436 while (s < strend) {
1437 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1438 if (tmp && (norun || regtry(prog, s)))
1439 goto got_it;
1440 else
1441 tmp = doevery;
1442 }
bbce6d69 1443 else
ffc61ed2
JH
1444 tmp = 1;
1445 s += UTF8SKIP(s);
bbce6d69 1446 }
bbce6d69 1447 }
ffc61ed2
JH
1448 else {
1449 while (s < strend) {
1450 if (!isSPACE_LC(*s)) {
1451 if (tmp && (norun || regtry(prog, s)))
1452 goto got_it;
1453 else
1454 tmp = doevery;
1455 }
a0ed51b3 1456 else
ffc61ed2
JH
1457 tmp = 1;
1458 s++;
a0ed51b3 1459 }
a0ed51b3
LW
1460 }
1461 break;
a0d0e21e 1462 case DIGIT:
ffc61ed2 1463 if (do_utf8) {
8269fa76 1464 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1465 while (s < strend) {
3568d838 1466 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1467 if (tmp && (norun || regtry(prog, s)))
1468 goto got_it;
1469 else
1470 tmp = doevery;
1471 }
a0d0e21e 1472 else
ffc61ed2
JH
1473 tmp = 1;
1474 s += UTF8SKIP(s);
2b69d0c2 1475 }
a0d0e21e 1476 }
ffc61ed2
JH
1477 else {
1478 while (s < strend) {
1479 if (isDIGIT(*s)) {
1480 if (tmp && (norun || regtry(prog, s)))
1481 goto got_it;
1482 else
1483 tmp = doevery;
1484 }
a0ed51b3 1485 else
ffc61ed2
JH
1486 tmp = 1;
1487 s++;
a0ed51b3 1488 }
a0ed51b3
LW
1489 }
1490 break;
b8c5462f
JH
1491 case DIGITL:
1492 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1493 if (do_utf8) {
1494 while (s < strend) {
1495 if (isDIGIT_LC_utf8((U8*)s)) {
1496 if (tmp && (norun || regtry(prog, s)))
1497 goto got_it;
1498 else
1499 tmp = doevery;
1500 }
b8c5462f 1501 else
ffc61ed2
JH
1502 tmp = 1;
1503 s += UTF8SKIP(s);
b8c5462f 1504 }
b8c5462f 1505 }
ffc61ed2
JH
1506 else {
1507 while (s < strend) {
1508 if (isDIGIT_LC(*s)) {
1509 if (tmp && (norun || regtry(prog, s)))
1510 goto got_it;
1511 else
1512 tmp = doevery;
1513 }
b8c5462f 1514 else
ffc61ed2
JH
1515 tmp = 1;
1516 s++;
b8c5462f 1517 }
b8c5462f
JH
1518 }
1519 break;
a0d0e21e 1520 case NDIGIT:
ffc61ed2 1521 if (do_utf8) {
8269fa76 1522 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1523 while (s < strend) {
3568d838 1524 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1525 if (tmp && (norun || regtry(prog, s)))
1526 goto got_it;
1527 else
1528 tmp = doevery;
1529 }
a0d0e21e 1530 else
ffc61ed2
JH
1531 tmp = 1;
1532 s += UTF8SKIP(s);
a687059c 1533 }
a0d0e21e 1534 }
ffc61ed2
JH
1535 else {
1536 while (s < strend) {
1537 if (!isDIGIT(*s)) {
1538 if (tmp && (norun || regtry(prog, s)))
1539 goto got_it;
1540 else
1541 tmp = doevery;
1542 }
a0ed51b3 1543 else
ffc61ed2
JH
1544 tmp = 1;
1545 s++;
a0ed51b3 1546 }
a0ed51b3
LW
1547 }
1548 break;
b8c5462f
JH
1549 case NDIGITL:
1550 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1551 if (do_utf8) {
1552 while (s < strend) {
1553 if (!isDIGIT_LC_utf8((U8*)s)) {
1554 if (tmp && (norun || regtry(prog, s)))
1555 goto got_it;
1556 else
1557 tmp = doevery;
1558 }
b8c5462f 1559 else
ffc61ed2
JH
1560 tmp = 1;
1561 s += UTF8SKIP(s);
b8c5462f 1562 }
a0ed51b3 1563 }
ffc61ed2
JH
1564 else {
1565 while (s < strend) {
1566 if (!isDIGIT_LC(*s)) {
1567 if (tmp && (norun || regtry(prog, s)))
1568 goto got_it;
1569 else
1570 tmp = doevery;
1571 }
cf93c79d 1572 else
ffc61ed2
JH
1573 tmp = 1;
1574 s++;
b8c5462f 1575 }
b8c5462f
JH
1576 }
1577 break;
b3c9acc1 1578 default:
3c3eec57
GS
1579 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1580 break;
d6a28714 1581 }
6eb5f6b9
JH
1582 return 0;
1583 got_it:
1584 return s;
1585}
1586
1587/*
1588 - regexec_flags - match a regexp against a string
1589 */
1590I32
1591Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1592 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1593/* strend: pointer to null at end of string */
1594/* strbeg: real beginning of string */
1595/* minend: end of match must be >=minend after stringarg. */
1596/* data: May be used for some additional optimizations. */
1597/* nosave: For optimizations. */
1598{
6eb5f6b9
JH
1599 register char *s;
1600 register regnode *c;
1601 register char *startpos = stringarg;
6eb5f6b9
JH
1602 I32 minlen; /* must match at least this many chars */
1603 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1604 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1605 constant substr. */ /* CC */
1606 I32 end_shift = 0; /* Same for the end. */ /* CC */
1607 I32 scream_pos = -1; /* Internal iterator of scream. */
1608 char *scream_olds;
1609 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1610 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1611#ifdef DEBUGGING
9e55ce06
JH
1612 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1613 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1614#endif
6eb5f6b9
JH
1615
1616 PL_regcc = 0;
1617
1618 cache_re(prog);
1619#ifdef DEBUGGING
aea4f609 1620 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1621#endif
1622
1623 /* Be paranoid... */
1624 if (prog == NULL || startpos == NULL) {
1625 Perl_croak(aTHX_ "NULL regexp parameter");
1626 return 0;
1627 }
1628
1629 minlen = prog->minlen;
61a36c01 1630 if (strend - startpos < minlen) {
a72c7584
JH
1631 DEBUG_r(PerlIO_printf(Perl_debug_log,
1632 "String too short [regexec_flags]...\n"));
1633 goto phooey;
1aa99e6b 1634 }
6eb5f6b9 1635
6eb5f6b9
JH
1636 /* Check validity of program. */
1637 if (UCHARAT(prog->program) != REG_MAGIC) {
1638 Perl_croak(aTHX_ "corrupted regexp program");
1639 }
1640
1641 PL_reg_flags = 0;
1642 PL_reg_eval_set = 0;
1643 PL_reg_maxiter = 0;
1644
1645 if (prog->reganch & ROPT_UTF8)
1646 PL_reg_flags |= RF_utf8;
1647
1648 /* Mark beginning of line for ^ and lookbehind. */
1649 PL_regbol = startpos;
1650 PL_bostr = strbeg;
1651 PL_reg_sv = sv;
1652
1653 /* Mark end of line for $ (and such) */
1654 PL_regeol = strend;
1655
1656 /* see how far we have to get to not match where we matched before */
1657 PL_regtill = startpos+minend;
1658
1659 /* We start without call_cc context. */
1660 PL_reg_call_cc = 0;
1661
1662 /* If there is a "must appear" string, look for it. */
1663 s = startpos;
1664
1665 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1666 MAGIC *mg;
1667
1668 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1669 PL_reg_ganch = startpos;
1670 else if (sv && SvTYPE(sv) >= SVt_PVMG
1671 && SvMAGIC(sv)
14befaf4
DM
1672 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1673 && mg->mg_len >= 0) {
6eb5f6b9
JH
1674 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1675 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1676 if (s > PL_reg_ganch)
6eb5f6b9
JH
1677 goto phooey;
1678 s = PL_reg_ganch;
1679 }
1680 }
1681 else /* pos() not defined */
1682 PL_reg_ganch = strbeg;
1683 }
1684
33b8afdf 1685 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1686 re_scream_pos_data d;
1687
1688 d.scream_olds = &scream_olds;
1689 d.scream_pos = &scream_pos;
1690 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1691 if (!s) {
1692 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1693 goto phooey; /* not present */
3fa9c3d7 1694 }
6eb5f6b9
JH
1695 }
1696
2a782b5b 1697 DEBUG_r({
9e55ce06
JH
1698 char *s0 = UTF ?
1699 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1700 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1701 prog->precomp;
1702 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1703 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1704 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1705 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1706 if (!PL_colorset)
1707 reginitcolors();
1708 PerlIO_printf(Perl_debug_log,
9e55ce06 1709 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1710 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1711 len0, len0, s0,
2a782b5b 1712 PL_colors[1],
9e55ce06 1713 len0 > 60 ? "..." : "",
2a782b5b 1714 PL_colors[0],
9e55ce06
JH
1715 (int)(len1 > 60 ? 60 : len1),
1716 s1, PL_colors[1],
1717 (len1 > 60 ? "..." : "")
2a782b5b
JH
1718 );
1719 });
6eb5f6b9
JH
1720
1721 /* Simplest case: anchored match need be tried only once. */
1722 /* [unless only anchor is BOL and multiline is set] */
1723 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1724 if (s == startpos && regtry(prog, startpos))
1725 goto got_it;
1726 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1727 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1728 {
1729 char *end;
1730
1731 if (minlen)
1732 dontbother = minlen - 1;
1aa99e6b 1733 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1734 /* for multiline we only have to try after newlines */
33b8afdf 1735 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1736 if (s == startpos)
1737 goto after_try;
1738 while (1) {
1739 if (regtry(prog, s))
1740 goto got_it;
1741 after_try:
1742 if (s >= end)
1743 goto phooey;
1744 if (prog->reganch & RE_USE_INTUIT) {
1745 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1746 if (!s)
1747 goto phooey;
1748 }
1749 else
1750 s++;
1751 }
1752 } else {
1753 if (s > startpos)
1754 s--;
1755 while (s < end) {
1756 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1757 if (regtry(prog, s))
1758 goto got_it;
1759 }
1760 }
1761 }
1762 }
1763 goto phooey;
1764 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1765 if (regtry(prog, PL_reg_ganch))
1766 goto got_it;
1767 goto phooey;
1768 }
1769
1770 /* Messy cases: unanchored match. */
33b8afdf 1771 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1772 /* we have /x+whatever/ */
1773 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1774 char ch;
bf93d4cc
GS
1775#ifdef DEBUGGING
1776 int did_match = 0;
1777#endif
33b8afdf
JH
1778 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1779 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1780 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1781
1aa99e6b 1782 if (do_utf8) {
6eb5f6b9
JH
1783 while (s < strend) {
1784 if (*s == ch) {
bf93d4cc 1785 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1786 if (regtry(prog, s)) goto got_it;
1787 s += UTF8SKIP(s);
1788 while (s < strend && *s == ch)
1789 s += UTF8SKIP(s);
1790 }
1791 s += UTF8SKIP(s);
1792 }
1793 }
1794 else {
1795 while (s < strend) {
1796 if (*s == ch) {
bf93d4cc 1797 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1798 if (regtry(prog, s)) goto got_it;
1799 s++;
1800 while (s < strend && *s == ch)
1801 s++;
1802 }
1803 s++;
1804 }
1805 }
b7953727 1806 DEBUG_r(if (!did_match)
bf93d4cc 1807 PerlIO_printf(Perl_debug_log,
b7953727
JH
1808 "Did not find anchored character...\n")
1809 );
6eb5f6b9
JH
1810 }
1811 /*SUPPRESS 560*/
33b8afdf
JH
1812 else if (prog->anchored_substr != Nullsv
1813 || prog->anchored_utf8 != Nullsv
1814 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1815 && prog->float_max_offset < strend - s)) {
1816 SV *must;
1817 I32 back_max;
1818 I32 back_min;
1819 char *last;
6eb5f6b9 1820 char *last1; /* Last position checked before */
bf93d4cc
GS
1821#ifdef DEBUGGING
1822 int did_match = 0;
1823#endif
33b8afdf
JH
1824 if (prog->anchored_substr || prog->anchored_utf8) {
1825 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1826 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1827 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1828 back_max = back_min = prog->anchored_offset;
1829 } else {
1830 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1831 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1832 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1833 back_max = prog->float_max_offset;
1834 back_min = prog->float_min_offset;
1835 }
1836 if (must == &PL_sv_undef)
1837 /* could not downgrade utf8 check substring, so must fail */
1838 goto phooey;
1839
1840 last = HOP3c(strend, /* Cannot start after this */
1841 -(I32)(CHR_SVLEN(must)
1842 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1843
1844 if (s > PL_bostr)
1845 last1 = HOPc(s, -1);
1846 else
1847 last1 = s - 1; /* bogus */
1848
1849 /* XXXX check_substr already used to find `s', can optimize if
1850 check_substr==must. */
1851 scream_pos = -1;
1852 dontbother = end_shift;
1853 strend = HOPc(strend, -dontbother);
1854 while ( (s <= last) &&
9041c2e3 1855 ((flags & REXEC_SCREAM)
1aa99e6b 1856 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1857 end_shift, &scream_pos, 0))
1aa99e6b 1858 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1859 (unsigned char*)strend, must,
6eb5f6b9 1860 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
bf93d4cc 1861 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1862 if (HOPc(s, -back_max) > last1) {
1863 last1 = HOPc(s, -back_min);
1864 s = HOPc(s, -back_max);
1865 }
1866 else {
1867 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1868
1869 last1 = HOPc(s, -back_min);
1870 s = t;
1871 }
1aa99e6b 1872 if (do_utf8) {
6eb5f6b9
JH
1873 while (s <= last1) {
1874 if (regtry(prog, s))
1875 goto got_it;
1876 s += UTF8SKIP(s);
1877 }
1878 }
1879 else {
1880 while (s <= last1) {
1881 if (regtry(prog, s))
1882 goto got_it;
1883 s++;
1884 }
1885 }
1886 }
b7953727
JH
1887 DEBUG_r(if (!did_match)
1888 PerlIO_printf(Perl_debug_log,
1889 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1890 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1891 ? "anchored" : "floating"),
1892 PL_colors[0],
1893 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1894 SvPVX(must),
b7953727
JH
1895 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1896 );
6eb5f6b9
JH
1897 goto phooey;
1898 }
155aba94 1899 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1900 if (minlen) {
1901 I32 op = (U8)OP(prog->regstclass);
66e933ab 1902 /* don't bother with what can't match */
f14c76ed
RGS
1903 if (PL_regkind[op] != EXACT && op != CANY)
1904 strend = HOPc(strend, -(minlen - 1));
1905 }
ffc61ed2
JH
1906 DEBUG_r({
1907 SV *prop = sv_newmortal();
9e55ce06
JH
1908 char *s0;
1909 char *s1;
1910 int len0;
1911 int len1;
1912
ffc61ed2 1913 regprop(prop, c);
9e55ce06
JH
1914 s0 = UTF ?
1915 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1916 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1917 SvPVX(prop);
1918 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1919 s1 = UTF ?
c728cb41 1920 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1921 len1 = UTF ? SvCUR(dsv1) : strend - s;
1922 PerlIO_printf(Perl_debug_log,
1923 "Matching stclass `%*.*s' against `%*.*s'\n",
1924 len0, len0, s0,
1925 len1, len1, s1);
ffc61ed2 1926 });
6eb5f6b9
JH
1927 if (find_byclass(prog, c, s, strend, startpos, 0))
1928 goto got_it;
bf93d4cc 1929 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1930 }
1931 else {
1932 dontbother = 0;
33b8afdf
JH
1933 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1934 /* Trim the end. */
d6a28714 1935 char *last;
33b8afdf
JH
1936 SV* float_real;
1937
1938 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1939 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1940 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1941
1942 if (flags & REXEC_SCREAM) {
33b8afdf 1943 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1944 end_shift, &scream_pos, 1); /* last one */
1945 if (!last)
ffc61ed2 1946 last = scream_olds; /* Only one occurrence. */
b8c5462f 1947 }
d6a28714
JH
1948 else {
1949 STRLEN len;
33b8afdf 1950 char *little = SvPV(float_real, len);
d6a28714 1951
33b8afdf 1952 if (SvTAIL(float_real)) {
d6a28714
JH
1953 if (memEQ(strend - len + 1, little, len - 1))
1954 last = strend - len + 1;
1955 else if (!PL_multiline)
9041c2e3 1956 last = memEQ(strend - len, little, len)
d6a28714 1957 ? strend - len : Nullch;
b8c5462f 1958 else
d6a28714
JH
1959 goto find_last;
1960 } else {
1961 find_last:
9041c2e3 1962 if (len)
d6a28714 1963 last = rninstr(s, strend, little, little + len);
b8c5462f 1964 else
d6a28714 1965 last = strend; /* matching `$' */
b8c5462f 1966 }
b8c5462f 1967 }
bf93d4cc
GS
1968 if (last == NULL) {
1969 DEBUG_r(PerlIO_printf(Perl_debug_log,
1970 "%sCan't trim the tail, match fails (should not happen)%s\n",
1971 PL_colors[4],PL_colors[5]));
1972 goto phooey; /* Should not happen! */
1973 }
d6a28714
JH
1974 dontbother = strend - last + prog->float_min_offset;
1975 }
1976 if (minlen && (dontbother < minlen))
1977 dontbother = minlen - 1;
1978 strend -= dontbother; /* this one's always in bytes! */
1979 /* We don't know much -- general case. */
1aa99e6b 1980 if (do_utf8) {
d6a28714
JH
1981 for (;;) {
1982 if (regtry(prog, s))
1983 goto got_it;
1984 if (s >= strend)
1985 break;
b8c5462f 1986 s += UTF8SKIP(s);
d6a28714
JH
1987 };
1988 }
1989 else {
1990 do {
1991 if (regtry(prog, s))
1992 goto got_it;
1993 } while (s++ < strend);
1994 }
1995 }
1996
1997 /* Failure. */
1998 goto phooey;
1999
2000got_it:
2001 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2002
2003 if (PL_reg_eval_set) {
2004 /* Preserve the current value of $^R */
2005 if (oreplsv != GvSV(PL_replgv))
2006 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2007 restored, the value remains
2008 the same. */
acfe0abc 2009 restore_pos(aTHX_ 0);
d6a28714
JH
2010 }
2011
2012 /* make sure $`, $&, $', and $digit will work later */
2013 if ( !(flags & REXEC_NOT_FIRST) ) {
2014 if (RX_MATCH_COPIED(prog)) {
2015 Safefree(prog->subbeg);
2016 RX_MATCH_COPIED_off(prog);
2017 }
2018 if (flags & REXEC_COPY_STR) {
2019 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2020
2021 s = savepvn(strbeg, i);
2022 prog->subbeg = s;
2023 prog->sublen = i;
2024 RX_MATCH_COPIED_on(prog);
2025 }
2026 else {
2027 prog->subbeg = strbeg;
2028 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2029 }
2030 }
9041c2e3 2031
d6a28714
JH
2032 return 1;
2033
2034phooey:
bf93d4cc
GS
2035 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2036 PL_colors[4],PL_colors[5]));
d6a28714 2037 if (PL_reg_eval_set)
acfe0abc 2038 restore_pos(aTHX_ 0);
d6a28714
JH
2039 return 0;
2040}
2041
2042/*
2043 - regtry - try match at specific point
2044 */
2045STATIC I32 /* 0 failure, 1 success */
2046S_regtry(pTHX_ regexp *prog, char *startpos)
2047{
d6a28714
JH
2048 register I32 i;
2049 register I32 *sp;
2050 register I32 *ep;
2051 CHECKPOINT lastcp;
2052
02db2b7b
IZ
2053#ifdef DEBUGGING
2054 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2055#endif
d6a28714
JH
2056 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2057 MAGIC *mg;
2058
2059 PL_reg_eval_set = RS_init;
2060 DEBUG_r(DEBUG_s(
b900a521
JH
2061 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2062 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2063 ));
e8347627 2064 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2065 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2066 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2067 SAVETMPS;
2068 /* Apparently this is not needed, judging by wantarray. */
e8347627 2069 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2070 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2071
2072 if (PL_reg_sv) {
2073 /* Make $_ available to executed code. */
2074 if (PL_reg_sv != DEFSV) {
4d1ff10f 2075 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2076 SAVESPTR(DEFSV);
2077 DEFSV = PL_reg_sv;
b8c5462f 2078 }
d6a28714 2079
9041c2e3 2080 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2081 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2082 /* prepare for quick setting of pos */
14befaf4
DM
2083 sv_magic(PL_reg_sv, (SV*)0,
2084 PERL_MAGIC_regex_global, Nullch, 0);
2085 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2086 mg->mg_len = -1;
b8c5462f 2087 }
d6a28714
JH
2088 PL_reg_magic = mg;
2089 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2090 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2091 }
09687e5a 2092 if (!PL_reg_curpm) {
0f79a09d 2093 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2094#ifdef USE_ITHREADS
2095 {
2096 SV* repointer = newSViv(0);
577e12cc 2097 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2098 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2099 av_push(PL_regex_padav,repointer);
2100 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2101 PL_regex_pad = AvARRAY(PL_regex_padav);
2102 }
2103#endif
2104 }
aaa362c4 2105 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2106 PL_reg_oldcurpm = PL_curpm;
2107 PL_curpm = PL_reg_curpm;
2108 if (RX_MATCH_COPIED(prog)) {
2109 /* Here is a serious problem: we cannot rewrite subbeg,
2110 since it may be needed if this match fails. Thus
2111 $` inside (?{}) could fail... */
2112 PL_reg_oldsaved = prog->subbeg;
2113 PL_reg_oldsavedlen = prog->sublen;
2114 RX_MATCH_COPIED_off(prog);
2115 }
2116 else
2117 PL_reg_oldsaved = Nullch;
2118 prog->subbeg = PL_bostr;
2119 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2120 }
2121 prog->startp[0] = startpos - PL_bostr;
2122 PL_reginput = startpos;
2123 PL_regstartp = prog->startp;
2124 PL_regendp = prog->endp;
2125 PL_reglastparen = &prog->lastparen;
a01268b5 2126 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2127 prog->lastparen = 0;
03994de8 2128 prog->lastcloseparen = 0;
d6a28714
JH
2129 PL_regsize = 0;
2130 DEBUG_r(PL_reg_starttry = startpos);
2131 if (PL_reg_start_tmpl <= prog->nparens) {
2132 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2133 if(PL_reg_start_tmp)
2134 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2135 else
2136 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2137 }
2138
2139 /* XXXX What this code is doing here?!!! There should be no need
2140 to do this again and again, PL_reglastparen should take care of
3dd2943c 2141 this! --ilya*/
dafc8851
JH
2142
2143 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2144 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2145 * PL_reglastparen), is not needed at all by the test suite
2146 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2147 * enough, for building DynaLoader, or otherwise this
2148 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2149 * will happen. Meanwhile, this code *is* needed for the
2150 * above-mentioned test suite tests to succeed. The common theme
2151 * on those tests seems to be returning null fields from matches.
2152 * --jhi */
dafc8851 2153#if 1
d6a28714
JH
2154 sp = prog->startp;
2155 ep = prog->endp;
2156 if (prog->nparens) {
eb160463 2157 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2158 *++sp = -1;
2159 *++ep = -1;
2160 }
2161 }
dafc8851 2162#endif
02db2b7b 2163 REGCP_SET(lastcp);
d6a28714
JH
2164 if (regmatch(prog->program + 1)) {
2165 prog->endp[0] = PL_reginput - PL_bostr;
2166 return 1;
2167 }
02db2b7b 2168 REGCP_UNWIND(lastcp);
d6a28714
JH
2169 return 0;
2170}
2171
02db2b7b
IZ
2172#define RE_UNWIND_BRANCH 1
2173#define RE_UNWIND_BRANCHJ 2
2174
2175union re_unwind_t;
2176
2177typedef struct { /* XX: makes sense to enlarge it... */
2178 I32 type;
2179 I32 prev;
2180 CHECKPOINT lastcp;
2181} re_unwind_generic_t;
2182
2183typedef struct {
2184 I32 type;
2185 I32 prev;
2186 CHECKPOINT lastcp;
2187 I32 lastparen;
2188 regnode *next;
2189 char *locinput;
2190 I32 nextchr;
2191#ifdef DEBUGGING
2192 int regindent;
2193#endif
2194} re_unwind_branch_t;
2195
2196typedef union re_unwind_t {
2197 I32 type;
2198 re_unwind_generic_t generic;
2199 re_unwind_branch_t branch;
2200} re_unwind_t;
2201
8ba1375e
MJD
2202#define sayYES goto yes
2203#define sayNO goto no
e0f9d4a8 2204#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2205#define sayYES_FINAL goto yes_final
2206#define sayYES_LOUD goto yes_loud
2207#define sayNO_FINAL goto no_final
2208#define sayNO_SILENT goto do_no
2209#define saySAME(x) if (x) goto yes; else goto no
2210
2211#define REPORT_CODE_OFF 24
2212
d6a28714
JH
2213/*
2214 - regmatch - main matching routine
2215 *
2216 * Conceptually the strategy is simple: check to see whether the current
2217 * node matches, call self recursively to see whether the rest matches,
2218 * and then act accordingly. In practice we make some effort to avoid
2219 * recursion, in particular by going through "ordinary" nodes (that don't
2220 * need to know whether the rest of the match failed) by a loop instead of
2221 * by recursion.
2222 */
2223/* [lwall] I've hoisted the register declarations to the outer block in order to
2224 * maybe save a little bit of pushing and popping on the stack. It also takes
2225 * advantage of machines that use a register save mask on subroutine entry.
2226 */
2227STATIC I32 /* 0 failure, 1 success */
2228S_regmatch(pTHX_ regnode *prog)
2229{
d6a28714
JH
2230 register regnode *scan; /* Current node. */
2231 regnode *next; /* Next node. */
2232 regnode *inner; /* Next node in internal branch. */
2233 register I32 nextchr; /* renamed nextchr - nextchar colides with
2234 function of same name */
2235 register I32 n; /* no or next */
b7953727
JH
2236 register I32 ln = 0; /* len or last */
2237 register char *s = Nullch; /* operand or save */
d6a28714 2238 register char *locinput = PL_reginput;
b7953727 2239 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2240 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2241 I32 unwind = 0;
b7953727 2242#if 0
02db2b7b 2243 I32 firstcp = PL_savestack_ix;
b7953727 2244#endif
53c4c00c 2245 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2246#ifdef DEBUGGING
ce333219
JH
2247 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2248 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2249 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2250#endif
02db2b7b 2251
d6a28714
JH
2252#ifdef DEBUGGING
2253 PL_regindent++;
2254#endif
2255
2256 /* Note that nextchr is a byte even in UTF */
2257 nextchr = UCHARAT(locinput);
2258 scan = prog;
2259 while (scan != NULL) {
8ba1375e 2260
2a782b5b 2261 DEBUG_r( {
d6a28714
JH
2262 SV *prop = sv_newmortal();
2263 int docolor = *PL_colors[0];
2264 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2265 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2266 /* The part of the string before starttry has one color
2267 (pref0_len chars), between starttry and current
2268 position another one (pref_len - pref0_len chars),
2269 after the current position the third one.
2270 We assume that pref0_len <= pref_len, otherwise we
2271 decrease pref0_len. */
9041c2e3 2272 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2273 ? (5 + taill) - l : locinput - PL_bostr;
2274 int pref0_len;
d6a28714 2275
df1ffd02 2276 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2277 pref_len++;
2278 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2279 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2280 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2281 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2282 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2283 l--;
d6a28714
JH
2284 if (pref0_len < 0)
2285 pref0_len = 0;
2286 if (pref0_len > pref_len)
2287 pref0_len = pref_len;
2288 regprop(prop, scan);
2a782b5b
JH
2289 {
2290 char *s0 =
f14c76ed 2291 do_utf8 && OP(scan) != CANY ?
2a782b5b 2292 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2293 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2294 locinput - pref_len;
df1ffd02 2295 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2296 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2297 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2298 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2299 locinput - pref_len + pref0_len;
df1ffd02 2300 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2301 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2302 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2303 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2304 locinput;
df1ffd02 2305 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2306 PerlIO_printf(Perl_debug_log,
2307 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2308 (IV)(locinput - PL_bostr),
2309 PL_colors[4],
2310 len0, s0,
2311 PL_colors[5],
2312 PL_colors[2],
2313 len1, s1,
2314 PL_colors[3],
2315 (docolor ? "" : "> <"),
2316 PL_colors[0],
2317 len2, s2,
2318 PL_colors[1],
2319 15 - l - pref_len + 1,
2320 "",
2321 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2322 SvPVX(prop));
2323 }
2324 });
d6a28714
JH
2325
2326 next = scan + NEXT_OFF(scan);
2327 if (next == scan)
2328 next = NULL;
2329
2330 switch (OP(scan)) {
2331 case BOL:
12d33761
HS
2332 if (locinput == PL_bostr || (PL_multiline &&
2333 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2334 {
2335 /* regtill = regbol; */
b8c5462f
JH
2336 break;
2337 }
d6a28714
JH
2338 sayNO;
2339 case MBOL:
12d33761
HS
2340 if (locinput == PL_bostr ||
2341 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2342 {
b8c5462f
JH
2343 break;
2344 }
d6a28714
JH
2345 sayNO;
2346 case SBOL:
c2a73568 2347 if (locinput == PL_bostr)
b8c5462f 2348 break;
d6a28714
JH
2349 sayNO;
2350 case GPOS:
2351 if (locinput == PL_reg_ganch)
2352 break;
2353 sayNO;
2354 case EOL:
2355 if (PL_multiline)
2356 goto meol;
2357 else
2358 goto seol;
2359 case MEOL:
2360 meol:
2361 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2362 sayNO;
b8c5462f 2363 break;
d6a28714
JH
2364 case SEOL:
2365 seol:
2366 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2367 sayNO;
d6a28714 2368 if (PL_regeol - locinput > 1)
b8c5462f 2369 sayNO;
b8c5462f 2370 break;
d6a28714
JH
2371 case EOS:
2372 if (PL_regeol != locinput)
b8c5462f 2373 sayNO;
d6a28714 2374 break;
ffc61ed2 2375 case SANY:
d6a28714 2376 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2377 sayNO;
f33976b4
DB
2378 if (do_utf8) {
2379 locinput += PL_utf8skip[nextchr];
2380 if (locinput > PL_regeol)
2381 sayNO;
2382 nextchr = UCHARAT(locinput);
2383 }
2384 else
2385 nextchr = UCHARAT(++locinput);
2386 break;
2387 case CANY:
2388 if (!nextchr && locinput >= PL_regeol)
2389 sayNO;
b8c5462f 2390 nextchr = UCHARAT(++locinput);
a0d0e21e 2391 break;
ffc61ed2 2392 case REG_ANY:
1aa99e6b
IH
2393 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2394 sayNO;
2395 if (do_utf8) {
b8c5462f 2396 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2397 if (locinput > PL_regeol)
2398 sayNO;
a0ed51b3 2399 nextchr = UCHARAT(locinput);
a0ed51b3 2400 }
1aa99e6b
IH
2401 else
2402 nextchr = UCHARAT(++locinput);
a0ed51b3 2403 break;
d6a28714 2404 case EXACT:
cd439c50
IZ
2405 s = STRING(scan);
2406 ln = STR_LEN(scan);
eb160463 2407 if (do_utf8 != UTF) {
bc517b45 2408 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2409 char *l = locinput;
2410 char *e = s + ln;
bc517b45 2411 STRLEN ulen;
a72c7584 2412
5ff6fc6d
JH
2413 if (do_utf8) {
2414 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2415 while (s < e) {
2416 if (l >= PL_regeol)
5ff6fc6d
JH
2417 sayNO;
2418 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2419 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2420 ckWARN(WARN_UTF8) ?
2421 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2422 sayNO;
bc517b45 2423 l += ulen;
5ff6fc6d 2424 s ++;
1aa99e6b 2425 }
5ff6fc6d
JH
2426 }
2427 else {
2428 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2429 while (s < e) {
2430 if (l >= PL_regeol)
2431 sayNO;
5ff6fc6d 2432 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2433 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2434 ckWARN(WARN_UTF8) ?
2435 0 : UTF8_ALLOW_ANY))
1aa99e6b 2436 sayNO;
bc517b45 2437 s += ulen;
a72c7584 2438 l ++;
1aa99e6b 2439 }
5ff6fc6d 2440 }
1aa99e6b
IH
2441 locinput = l;
2442 nextchr = UCHARAT(locinput);
2443 break;
2444 }
bc517b45 2445 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2446 /* Inline the first character, for speed. */
2447 if (UCHARAT(s) != nextchr)
2448 sayNO;
2449 if (PL_regeol - locinput < ln)
2450 sayNO;
2451 if (ln > 1 && memNE(s, locinput, ln))
2452 sayNO;
2453 locinput += ln;
2454 nextchr = UCHARAT(locinput);
2455 break;
2456 case EXACTFL:
b8c5462f
JH
2457 PL_reg_flags |= RF_tainted;
2458 /* FALL THROUGH */
d6a28714 2459 case EXACTF:
cd439c50
IZ
2460 s = STRING(scan);
2461 ln = STR_LEN(scan);
d6a28714 2462
d07ddd77
JH
2463 if (do_utf8 || UTF) {
2464 /* Either target or the pattern are utf8. */
d6a28714 2465 char *l = locinput;
d07ddd77 2466 char *e = PL_regeol;
bc517b45 2467
eb160463 2468 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2469 l, &e, 0, do_utf8)) {
5486206c
JH
2470 /* One more case for the sharp s:
2471 * pack("U0U*", 0xDF) =~ /ss/i,
2472 * the 0xC3 0x9F are the UTF-8
2473 * byte sequence for the U+00DF. */
2474 if (!(do_utf8 &&
2475 toLOWER(s[0]) == 's' &&
2476 ln >= 2 &&
2477 toLOWER(s[1]) == 's' &&
2478 (U8)l[0] == 0xC3 &&
2479 e - l >= 2 &&
2480 (U8)l[1] == 0x9F))
2481 sayNO;
2482 }
d07ddd77
JH
2483 locinput = e;
2484 nextchr = UCHARAT(locinput);
2485 break;
a0ed51b3 2486 }
d6a28714 2487
bc517b45
JH
2488 /* Neither the target and the pattern are utf8. */
2489
d6a28714
JH
2490 /* Inline the first character, for speed. */
2491 if (UCHARAT(s) != nextchr &&
2492 UCHARAT(s) != ((OP(scan) == EXACTF)
2493 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2494 sayNO;
d6a28714 2495 if (PL_regeol - locinput < ln)
b8c5462f 2496 sayNO;
d6a28714
JH
2497 if (ln > 1 && (OP(scan) == EXACTF
2498 ? ibcmp(s, locinput, ln)
2499 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2500 sayNO;
d6a28714
JH
2501 locinput += ln;
2502 nextchr = UCHARAT(locinput);
a0d0e21e 2503 break;
d6a28714 2504 case ANYOF:
ffc61ed2 2505 if (do_utf8) {
9e55ce06
JH
2506 STRLEN inclasslen = PL_regeol - locinput;
2507
ba7b4546 2508 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2509 sayNO_ANYOF;
ffc61ed2
JH
2510 if (locinput >= PL_regeol)
2511 sayNO;
0f0076b4 2512 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2513 nextchr = UCHARAT(locinput);
e0f9d4a8 2514 break;
ffc61ed2
JH
2515 }
2516 else {
2517 if (nextchr < 0)
2518 nextchr = UCHARAT(locinput);
7d3e948e 2519 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2520 sayNO_ANYOF;
ffc61ed2
JH
2521 if (!nextchr && locinput >= PL_regeol)
2522 sayNO;
2523 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2524 break;
2525 }
2526 no_anyof:
2527 /* If we might have the case of the German sharp s
2528 * in a casefolding Unicode character class. */
2529
ebc501f0
JH
2530 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2531 locinput += SHARP_S_SKIP;
e0f9d4a8 2532 nextchr = UCHARAT(locinput);
ffc61ed2 2533 }
e0f9d4a8
JH
2534 else
2535 sayNO;
b8c5462f 2536 break;
d6a28714 2537 case ALNUML:
b8c5462f
JH
2538 PL_reg_flags |= RF_tainted;
2539 /* FALL THROUGH */
d6a28714 2540 case ALNUM:
b8c5462f 2541 if (!nextchr)
4633a7c4 2542 sayNO;
ffc61ed2 2543 if (do_utf8) {
ad24be35 2544 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2545 if (!(OP(scan) == ALNUM
3568d838 2546 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2547 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2548 {
2549 sayNO;
a0ed51b3 2550 }
b8c5462f 2551 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2552 nextchr = UCHARAT(locinput);
2553 break;
2554 }
ffc61ed2 2555 if (!(OP(scan) == ALNUM
d6a28714 2556 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2557 sayNO;
b8c5462f 2558 nextchr = UCHARAT(++locinput);
a0d0e21e 2559 break;
d6a28714 2560 case NALNUML:
b8c5462f
JH
2561 PL_reg_flags |= RF_tainted;
2562 /* FALL THROUGH */
d6a28714
JH
2563 case NALNUM:
2564 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2565 sayNO;
ffc61ed2 2566 if (do_utf8) {
8269fa76 2567 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2568 if (OP(scan) == NALNUM
3568d838 2569 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2570 : isALNUM_LC_utf8((U8*)locinput))
2571 {
b8c5462f 2572 sayNO;
d6a28714 2573 }
b8c5462f
JH
2574 locinput += PL_utf8skip[nextchr];
2575 nextchr = UCHARAT(locinput);
2576 break;
2577 }
ffc61ed2 2578 if (OP(scan) == NALNUM
d6a28714 2579 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2580 sayNO;
76e3520e 2581 nextchr = UCHARAT(++locinput);
a0d0e21e 2582 break;
d6a28714
JH
2583 case BOUNDL:
2584 case NBOUNDL:
3280af22 2585 PL_reg_flags |= RF_tainted;
bbce6d69 2586 /* FALL THROUGH */
d6a28714
JH
2587 case BOUND:
2588 case NBOUND:
2589 /* was last char in word? */
ffc61ed2 2590 if (do_utf8) {
12d33761
HS
2591 if (locinput == PL_bostr)
2592 ln = '\n';
ffc61ed2 2593 else {
b4f7163a 2594 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2595
b4f7163a 2596 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2597 }
2598 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2599 ln = isALNUM_uni(ln);
8269fa76 2600 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2601 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2602 }
2603 else {
9041c2e3 2604 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2605 n = isALNUM_LC_utf8((U8*)locinput);
2606 }
a0ed51b3 2607 }
d6a28714 2608 else {
12d33761
HS
2609 ln = (locinput != PL_bostr) ?
2610 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2611 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2612 ln = isALNUM(ln);
2613 n = isALNUM(nextchr);
2614 }
2615 else {
2616 ln = isALNUM_LC(ln);
2617 n = isALNUM_LC(nextchr);
2618 }
d6a28714 2619 }
ffc61ed2
JH
2620 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2621 OP(scan) == BOUNDL))
2622 sayNO;
a0ed51b3 2623 break;
d6a28714 2624 case SPACEL:
3280af22 2625 PL_reg_flags |= RF_tainted;
bbce6d69 2626 /* FALL THROUGH */
d6a28714 2627 case SPACE:
9442cb0e 2628 if (!nextchr)
4633a7c4 2629 sayNO;
1aa99e6b 2630 if (do_utf8) {
fd400ab9 2631 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2632 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2633 if (!(OP(scan) == SPACE
3568d838 2634 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2635 : isSPACE_LC_utf8((U8*)locinput)))
2636 {
2637 sayNO;
2638 }
2639 locinput += PL_utf8skip[nextchr];
2640 nextchr = UCHARAT(locinput);
2641 break;
d6a28714 2642 }
ffc61ed2
JH
2643 if (!(OP(scan) == SPACE
2644 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2645 sayNO;
2646 nextchr = UCHARAT(++locinput);
2647 }
2648 else {
2649 if (!(OP(scan) == SPACE
2650 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2651 sayNO;
2652 nextchr = UCHARAT(++locinput);
a0ed51b3 2653 }
a0ed51b3 2654 break;
d6a28714 2655 case NSPACEL:
3280af22 2656 PL_reg_flags |= RF_tainted;
bbce6d69 2657 /* FALL THROUGH */
d6a28714 2658 case NSPACE:
9442cb0e 2659 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2660 sayNO;
1aa99e6b 2661 if (do_utf8) {
8269fa76 2662 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2663 if (OP(scan) == NSPACE
3568d838 2664 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2665 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2666 {
2667 sayNO;
2668 }
2669 locinput += PL_utf8skip[nextchr];
2670 nextchr = UCHARAT(locinput);
2671 break;
a0ed51b3 2672 }
ffc61ed2 2673 if (OP(scan) == NSPACE
d6a28714 2674 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2675 sayNO;
76e3520e 2676 nextchr = UCHARAT(++locinput);
a0d0e21e 2677 break;
d6a28714 2678 case DIGITL:
a0ed51b3
LW
2679 PL_reg_flags |= RF_tainted;
2680 /* FALL THROUGH */
d6a28714 2681 case DIGIT:
9442cb0e 2682 if (!nextchr)
a0ed51b3 2683 sayNO;
1aa99e6b 2684 if (do_utf8) {
8269fa76 2685 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2686 if (!(OP(scan) == DIGIT
3568d838 2687 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2688 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2689 {
a0ed51b3 2690 sayNO;
dfe13c55 2691 }
6f06b55f 2692 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2693 nextchr = UCHARAT(locinput);
2694 break;
2695 }
ffc61ed2 2696 if (!(OP(scan) == DIGIT
9442cb0e 2697 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2698 sayNO;
2699 nextchr = UCHARAT(++locinput);
2700 break;
d6a28714 2701 case NDIGITL:
b8c5462f
JH
2702 PL_reg_flags |= RF_tainted;
2703 /* FALL THROUGH */
d6a28714 2704 case NDIGIT:
9442cb0e 2705 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2706 sayNO;
1aa99e6b 2707 if (do_utf8) {
8269fa76 2708 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2709 if (OP(scan) == NDIGIT
3568d838 2710 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2711 : isDIGIT_LC_utf8((U8*)locinput))
2712 {
a0ed51b3 2713 sayNO;
9442cb0e 2714 }
6f06b55f 2715 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2716 nextchr = UCHARAT(locinput);
2717 break;
2718 }
ffc61ed2 2719 if (OP(scan) == NDIGIT
9442cb0e 2720 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2721 sayNO;
2722 nextchr = UCHARAT(++locinput);
2723 break;
2724 case CLUMP:
b7c83a7e 2725 if (locinput >= PL_regeol)
a0ed51b3 2726 sayNO;
b7c83a7e
JH
2727 if (do_utf8) {
2728 LOAD_UTF8_CHARCLASS(mark,"~");
2729 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2730 sayNO;
2731 locinput += PL_utf8skip[nextchr];
2732 while (locinput < PL_regeol &&
2733 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2734 locinput += UTF8SKIP(locinput);
2735 if (locinput > PL_regeol)
2736 sayNO;
eb08e2da
JH
2737 }
2738 else
2739 locinput++;
a0ed51b3
LW
2740 nextchr = UCHARAT(locinput);
2741 break;
c8756f30 2742 case REFFL:
3280af22 2743 PL_reg_flags |= RF_tainted;
c8756f30 2744 /* FALL THROUGH */
c277df42 2745 case REF:
c8756f30 2746 case REFF:
c277df42 2747 n = ARG(scan); /* which paren pair */
cf93c79d 2748 ln = PL_regstartp[n];
2c2d71f5 2749 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2750 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2751 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2752 if (ln == PL_regendp[n])
a0d0e21e 2753 break;
a0ed51b3 2754
cf93c79d 2755 s = PL_bostr + ln;
1aa99e6b 2756 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2757 char *l = locinput;
cf93c79d 2758 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2759 /*
2760 * Note that we can't do the "other character" lookup trick as
2761 * in the 8-bit case (no pun intended) because in Unicode we
2762 * have to map both upper and title case to lower case.
2763 */
2764 if (OP(scan) == REFF) {
a2a2844f 2765 STRLEN ulen1, ulen2;
e7ae6809
JH
2766 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2767 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2768 while (s < e) {
2769 if (l >= PL_regeol)
2770 sayNO;
a2a2844f
JH
2771 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2772 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2773 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2774 sayNO;
a2a2844f
JH
2775 s += ulen1;
2776 l += ulen2;
a0ed51b3
LW
2777 }
2778 }
2779 locinput = l;
2780 nextchr = UCHARAT(locinput);
2781 break;
2782 }
2783
a0d0e21e 2784 /* Inline the first character, for speed. */
76e3520e 2785 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2786 (OP(scan) == REF ||
2787 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2788 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2789 sayNO;
cf93c79d 2790 ln = PL_regendp[n] - ln;
3280af22 2791 if (locinput + ln > PL_regeol)
4633a7c4 2792 sayNO;
c8756f30
AK
2793 if (ln > 1 && (OP(scan) == REF
2794 ? memNE(s, locinput, ln)
2795 : (OP(scan) == REFF
2796 ? ibcmp(s, locinput, ln)
2797 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2798 sayNO;
a0d0e21e 2799 locinput += ln;
76e3520e 2800 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2801 break;
2802
2803 case NOTHING:
c277df42 2804 case TAIL:
a0d0e21e
LW
2805 break;
2806 case BACK:
2807 break;
c277df42
IZ
2808 case EVAL:
2809 {
2810 dSP;
533c011a 2811 OP_4tree *oop = PL_op;
3280af22 2812 COP *ocurcop = PL_curcop;
f3548bdc 2813 PAD *old_comppad;
c277df42 2814 SV *ret;
9041c2e3 2815
c277df42 2816 n = ARG(scan);
533c011a 2817 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2818 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2819 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2820 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2821
8e5e9ebe
RGS
2822 {
2823 SV **before = SP;
2824 CALLRUNOPS(aTHX); /* Scalar context. */
2825 SPAGAIN;
2826 if (SP == before)
075aa684 2827 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2828 else {
2829 ret = POPs;
2830 PUTBACK;
2831 }
2832 }
2833
0f5d15d6 2834 PL_op = oop;
f3548bdc 2835 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2836 PL_curcop = ocurcop;
c277df42 2837 if (logical) {
0f5d15d6
IZ
2838 if (logical == 2) { /* Postponed subexpression. */
2839 regexp *re;
22c35a8c 2840 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2841 re_cc_state state;
0f5d15d6 2842 CHECKPOINT cp, lastcp;
cb50f42d 2843 int toggleutf;
0f5d15d6
IZ
2844
2845 if(SvROK(ret) || SvRMAGICAL(ret)) {
2846 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2847
2848 if(SvMAGICAL(sv))
14befaf4 2849 mg = mg_find(sv, PERL_MAGIC_qr);
0f5d15d6
IZ
2850 }
2851 if (mg) {
2852 re = (regexp *)mg->mg_obj;
df0003d4 2853 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2854 }
2855 else {
2856 STRLEN len;
2857 char *t = SvPV(ret, len);
2858 PMOP pm;
2859 char *oprecomp = PL_regprecomp;
2860 I32 osize = PL_regsize;
2861 I32 onpar = PL_regnpar;
2862
5fcd1c1b 2863 Zero(&pm, 1, PMOP);
cb50f42d 2864 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2865 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2866 if (!(SvFLAGS(ret)
0f5d15d6 2867 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
14befaf4
DM
2868 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2869 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2870 PL_regprecomp = oprecomp;
2871 PL_regsize = osize;
2872 PL_regnpar = onpar;
2873 }
2874 DEBUG_r(
9041c2e3 2875 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2876 "Entering embedded `%s%.60s%s%s'\n",
2877 PL_colors[0],
2878 re->precomp,
2879 PL_colors[1],
2880 (strlen(re->precomp) > 60 ? "..." : ""))
2881 );
2882 state.node = next;
2883 state.prev = PL_reg_call_cc;
2884 state.cc = PL_regcc;
2885 state.re = PL_reg_re;
2886
2ab05381 2887 PL_regcc = 0;
9041c2e3 2888
0f5d15d6 2889 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2890 REGCP_SET(lastcp);
0f5d15d6
IZ
2891 cache_re(re);
2892 state.ss = PL_savestack_ix;
2893 *PL_reglastparen = 0;
a01268b5 2894 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2895 PL_reg_call_cc = &state;
2896 PL_reginput = locinput;
cb50f42d
YST
2897 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2898 ((re->reganch & ROPT_UTF8) != 0);
2899 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2900
2901 /* XXXX This is too dramatic a measure... */
2902 PL_reg_maxiter = 0;
2903
0f5d15d6 2904 if (regmatch(re->program + 1)) {
2c914db6
IZ
2905 /* Even though we succeeded, we need to restore
2906 global variables, since we may be wrapped inside
2907 SUSPEND, thus the match may be not finished yet. */
2908
2909 /* XXXX Do this only if SUSPENDed? */
2910 PL_reg_call_cc = state.prev;
2911 PL_regcc = state.cc;
2912 PL_reg_re = state.re;
2913 cache_re(PL_reg_re);
cb50f42d 2914 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2915
2916 /* XXXX This is too dramatic a measure... */
2917 PL_reg_maxiter = 0;
2918
2919 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2920 ReREFCNT_dec(re);
2921 regcpblow(cp);
2922 sayYES;
2923 }
0f5d15d6 2924 ReREFCNT_dec(re);
02db2b7b 2925 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2926 regcppop();
2927 PL_reg_call_cc = state.prev;
2928 PL_regcc = state.cc;
2929 PL_reg_re = state.re;
d3790889 2930 cache_re(PL_reg_re);
cb50f42d 2931 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2932
2933 /* XXXX This is too dramatic a measure... */
2934 PL_reg_maxiter = 0;
2935
8e514ae6 2936 logical = 0;
0f5d15d6
IZ
2937 sayNO;
2938 }
c277df42 2939 sw = SvTRUE(ret);
0f5d15d6 2940 logical = 0;
a0ed51b3
LW
2941 }
2942 else
3280af22 2943 sv_setsv(save_scalar(PL_replgv), ret);
c277df42
IZ
2944 break;
2945 }
a0d0e21e 2946 case OPEN:
c277df42 2947 n = ARG(scan); /* which paren pair */
3280af22
NIS
2948 PL_reg_start_tmp[n] = locinput;
2949 if (n > PL_regsize)
2950 PL_regsize = n;
a0d0e21e
LW
2951 break;
2952 case CLOSE:
c277df42 2953 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2954 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2955 PL_regendp[n] = locinput - PL_bostr;
eb160463 2956 if (n > (I32)*PL_reglastparen)
3280af22 2957 *PL_reglastparen = n;
a01268b5 2958 *PL_reglastcloseparen = n;
a0d0e21e 2959 break;
c277df42
IZ
2960 case GROUPP:
2961 n = ARG(scan); /* which paren pair */
eb160463 2962 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2963 break;
2964 case IFTHEN:
2c2d71f5 2965 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2966 if (sw)
2967 next = NEXTOPER(NEXTOPER(scan));
2968 else {
2969 next = scan + ARG(scan);
2970 if (OP(next) == IFTHEN) /* Fake one. */
2971 next = NEXTOPER(NEXTOPER(next));
2972 }
2973 break;
2974 case LOGICAL:
0f5d15d6 2975 logical = scan->flags;
c277df42 2976 break;
2ab05381
IZ
2977/*******************************************************************
2978 PL_regcc contains infoblock about the innermost (...)* loop, and
2979 a pointer to the next outer infoblock.
2980
2981 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2982
2983 1) After matching X, regnode for CURLYX is processed;
2984
9041c2e3 2985 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
2986 regmatch() recursively with the starting point at WHILEM node;
2987
2988 3) Each hit of WHILEM node tries to match A and Z (in the order
2989 depending on the current iteration, min/max of {min,max} and
2990 greediness). The information about where are nodes for "A"
2991 and "Z" is read from the infoblock, as is info on how many times "A"
2992 was already matched, and greediness.
2993
2994 4) After A matches, the same WHILEM node is hit again.
2995
2996 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2997 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2998 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2999 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3000 of the external loop.
3001
3002 Currently present infoblocks form a tree with a stem formed by PL_curcc
3003 and whatever it mentions via ->next, and additional attached trees
3004 corresponding to temporarily unset infoblocks as in "5" above.
3005
9041c2e3 3006 In the following picture infoblocks for outer loop of
2ab05381
IZ
3007 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3008 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3009 infoblocks are drawn below the "reset" infoblock.
3010
3011 In fact in the picture below we do not show failed matches for Z and T
3012 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3013 more obvious *why* one needs to *temporary* unset infoblocks.]
3014
3015 Matched REx position InfoBlocks Comment
3016 (Y(A)*?Z)*?T x
3017 Y(A)*?Z)*?T x <- O
3018 Y (A)*?Z)*?T x <- O
3019 Y A)*?Z)*?T x <- O <- I
3020 YA )*?Z)*?T x <- O <- I
3021 YA A)*?Z)*?T x <- O <- I
3022 YAA )*?Z)*?T x <- O <- I
3023 YAA Z)*?T x <- O # Temporary unset I
3024 I
3025
3026 YAAZ Y(A)*?Z)*?T x <- O
3027 I
3028
3029 YAAZY (A)*?Z)*?T x <- O
3030 I
3031
3032 YAAZY A)*?Z)*?T x <- O <- I
3033 I
3034
3035 YAAZYA )*?Z)*?T x <- O <- I
3036 I
3037
3038 YAAZYA Z)*?T x <- O # Temporary unset I
3039 I,I
3040
3041 YAAZYAZ )*?T x <- O
3042 I,I
3043
3044 YAAZYAZ T x # Temporary unset O
3045 O
3046 I,I
3047
3048 YAAZYAZT x
3049 O
3050 I,I
3051 *******************************************************************/
a0d0e21e
LW
3052 case CURLYX: {
3053 CURCUR cc;
3280af22 3054 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3055 /* No need to save/restore up to this paren */
3056 I32 parenfloor = scan->flags;
c277df42
IZ
3057
3058 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3059 next += ARG(next);
3280af22
NIS
3060 cc.oldcc = PL_regcc;
3061 PL_regcc = &cc;
cb434fcc
IZ
3062 /* XXXX Probably it is better to teach regpush to support
3063 parenfloor > PL_regsize... */
eb160463 3064 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3065 parenfloor = *PL_reglastparen; /* Pessimization... */
3066 cc.parenfloor = parenfloor;
a0d0e21e
LW
3067 cc.cur = -1;
3068 cc.min = ARG1(scan);
3069 cc.max = ARG2(scan);
c277df42 3070 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3071 cc.next = next;
3072 cc.minmod = minmod;
3073 cc.lastloc = 0;
3280af22 3074 PL_reginput = locinput;
a0d0e21e
LW
3075 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3076 regcpblow(cp);
3280af22 3077 PL_regcc = cc.oldcc;
4633a7c4 3078 saySAME(n);
a0d0e21e
LW
3079 }
3080 /* NOT REACHED */
3081 case WHILEM: {
3082 /*
3083 * This is really hard to understand, because after we match
3084 * what we're trying to match, we must make sure the rest of
2c2d71f5 3085 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3086 * to go back UP the parse tree by recursing ever deeper. And
3087 * if it fails, we have to reset our parent's current state
3088 * that we can try again after backing off.
3089 */
3090
c277df42 3091 CHECKPOINT cp, lastcp;
3280af22 3092 CURCUR* cc = PL_regcc;
c277df42
IZ
3093 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3094
4633a7c4 3095 n = cc->cur + 1; /* how many we know we matched */
3280af22 3096 PL_reginput = locinput;
a0d0e21e 3097
c277df42 3098 DEBUG_r(
9041c2e3 3099 PerlIO_printf(Perl_debug_log,
91f3b821 3100 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3101 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3102 (long)n, (long)cc->min,
2797576d 3103 (long)cc->max, PTR2UV(cc))
c277df42 3104 );
4633a7c4 3105
a0d0e21e
LW
3106 /* If degenerate scan matches "", assume scan done. */
3107
579cf2c3 3108 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3109 PL_regcc = cc->oldcc;
2ab05381
IZ
3110 if (PL_regcc)
3111 ln = PL_regcc->cur;
c277df42 3112 DEBUG_r(
c3464db5
DD
3113 PerlIO_printf(Perl_debug_log,
3114 "%*s empty match detected, try continuation...\n",
3280af22 3115 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3116 );
a0d0e21e 3117 if (regmatch(cc->next))
4633a7c4 3118 sayYES;
2ab05381
IZ
3119 if (PL_regcc)
3120 PL_regcc->cur = ln;
3280af22 3121 PL_regcc = cc;
4633a7c4 3122 sayNO;
a0d0e21e
LW
3123 }
3124
3125 /* First just match a string of min scans. */
3126
3127 if (n < cc->min) {
3128 cc->cur = n;
3129 cc->lastloc = locinput;
4633a7c4
LW
3130 if (regmatch(cc->scan))
3131 sayYES;
3132 cc->cur = n - 1;
c277df42 3133 cc->lastloc = lastloc;
4633a7c4 3134 sayNO;
a0d0e21e
LW
3135 }
3136
2c2d71f5
JH
3137 if (scan->flags) {
3138 /* Check whether we already were at this position.
3139 Postpone detection until we know the match is not
3140 *that* much linear. */
3141 if (!PL_reg_maxiter) {
3142 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3143 PL_reg_leftiter = PL_reg_maxiter;
3144 }
3145 if (PL_reg_leftiter-- == 0) {
3146 I32 size = (PL_reg_maxiter + 7)/8;
3147 if (PL_reg_poscache) {
eb160463 3148 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3149 Renew(PL_reg_poscache, size, char);
3150 PL_reg_poscache_size = size;
3151 }
3152 Zero(PL_reg_poscache, size, char);
3153 }
3154 else {
3155 PL_reg_poscache_size = size;
3156 Newz(29, PL_reg_poscache, size, char);
3157 }
3158 DEBUG_r(
3159 PerlIO_printf(Perl_debug_log,
3160 "%sDetected a super-linear match, switching on caching%s...\n",
3161 PL_colors[4], PL_colors[5])
3162 );
3163 }
3164 if (PL_reg_leftiter < 0) {
3165 I32 o = locinput - PL_bostr, b;
3166
3167 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3168 b = o % 8;
3169 o /= 8;
3170 if (PL_reg_poscache[o] & (1<<b)) {
3171 DEBUG_r(
3172 PerlIO_printf(Perl_debug_log,
3173 "%*s already tried at this position...\n",
3174 REPORT_CODE_OFF+PL_regindent*2, "")
3175 );
7821416a 3176 sayNO_SILENT;
2c2d71f5
JH
3177 }
3178 PL_reg_poscache[o] |= (1<<b);
3179 }
3180 }
3181
a0d0e21e
LW
3182 /* Prefer next over scan for minimal matching. */
3183
3184 if (cc->minmod) {
3280af22 3185 PL_regcc = cc->oldcc;
2ab05381
IZ
3186 if (PL_regcc)
3187 ln = PL_regcc->cur;
5f05dabc 3188 cp = regcppush(cc->parenfloor);
02db2b7b 3189 REGCP_SET(lastcp);
5f05dabc 3190 if (regmatch(cc->next)) {
c277df42 3191 regcpblow(cp);
4633a7c4 3192 sayYES; /* All done. */
5f05dabc 3193 }
02db2b7b 3194 REGCP_UNWIND(lastcp);
5f05dabc 3195 regcppop();
2ab05381
IZ
3196 if (PL_regcc)
3197 PL_regcc->cur = ln;
3280af22 3198 PL_regcc = cc;
a0d0e21e 3199
c277df42 3200 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3201 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3202 && !(PL_reg_flags & RF_warned)) {
3203 PL_reg_flags |= RF_warned;
9014280d 3204 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3205 "Complex regular subexpression recursion",
3206 REG_INFTY - 1);
c277df42 3207 }
4633a7c4 3208 sayNO;
c277df42 3209 }
a687059c 3210
c277df42 3211 DEBUG_r(
c3464db5
DD
3212 PerlIO_printf(Perl_debug_log,
3213 "%*s trying longer...\n",
3280af22 3214 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3215 );
a0d0e21e 3216 /* Try scanning more and see if it helps. */
3280af22 3217 PL_reginput = locinput;
a0d0e21e
LW
3218 cc->cur = n;
3219 cc->lastloc = locinput;
5f05dabc 3220 cp = regcppush(cc->parenfloor);
02db2b7b 3221 REGCP_SET(lastcp);
5f05dabc 3222 if (regmatch(cc->scan)) {
c277df42 3223 regcpblow(cp);
4633a7c4 3224 sayYES;
5f05dabc 3225 }
02db2b7b 3226 REGCP_UNWIND(lastcp);
5f05dabc 3227 regcppop();
4633a7c4 3228 cc->cur = n - 1;
c277df42 3229 cc->lastloc = lastloc;
4633a7c4 3230 sayNO;
a0d0e21e
LW
3231 }
3232
3233 /* Prefer scan over next for maximal matching. */
3234
3235 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3236 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3237 cc->cur = n;
3238 cc->lastloc = locinput;
02db2b7b 3239 REGCP_SET(lastcp);
5f05dabc 3240 if (regmatch(cc->scan)) {
c277df42 3241 regcpblow(cp);
4633a7c4 3242 sayYES;
5f05dabc 3243 }
02db2b7b 3244 REGCP_UNWIND(lastcp);
a0d0e21e 3245 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3246 PL_reginput = locinput;
c277df42 3247 DEBUG_r(
c3464db5
DD
3248 PerlIO_printf(Perl_debug_log,
3249 "%*s failed, try continuation...\n",
3280af22 3250 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3251 );
3252 }
9041c2e3 3253 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3254 && !(PL_reg_flags & RF_warned)) {
3280af22 3255 PL_reg_flags |= RF_warned;
9014280d 3256 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3257 "Complex regular subexpression recursion",
3258 REG_INFTY - 1);
a0d0e21e
LW
3259 }
3260
3261 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3262 PL_regcc = cc->oldcc;
2ab05381
IZ
3263 if (PL_regcc)
3264 ln = PL_regcc->cur;
a0d0e21e 3265 if (regmatch(cc->next))
4633a7c4 3266 sayYES;
2ab05381
IZ
3267 if (PL_regcc)
3268 PL_regcc->cur = ln;
3280af22 3269 PL_regcc = cc;
4633a7c4 3270 cc->cur = n - 1;
c277df42 3271 cc->lastloc = lastloc;
4633a7c4 3272 sayNO;
a0d0e21e
LW
3273 }
3274 /* NOT REACHED */
9041c2e3 3275 case BRANCHJ:
c277df42
IZ
3276 next = scan + ARG(scan);
3277 if (next == scan)
3278 next = NULL;
3279 inner = NEXTOPER(NEXTOPER(scan));
3280 goto do_branch;
9041c2e3 3281 case BRANCH:
c277df42
IZ
3282 inner = NEXTOPER(scan);
3283 do_branch:
3284 {
c277df42
IZ
3285 c1 = OP(scan);
3286 if (OP(next) != c1) /* No choice. */
3287 next = inner; /* Avoid recursion. */
a0d0e21e 3288 else {
02db2b7b
IZ
3289 I32 lastparen = *PL_reglastparen;
3290 I32 unwind1;
3291 re_unwind_branch_t *uw;
3292
3293 /* Put unwinding data on stack */
3294 unwind1 = SSNEWt(1,re_unwind_branch_t);
3295 uw = SSPTRt(unwind1,re_unwind_branch_t);
3296 uw->prev = unwind;
3297 unwind = unwind1;
3298 uw->type = ((c1 == BRANCH)
3299 ? RE_UNWIND_BRANCH
3300 : RE_UNWIND_BRANCHJ);
3301 uw->lastparen = lastparen;
3302 uw->next = next;
3303 uw->locinput = locinput;
3304 uw->nextchr = nextchr;
3305#ifdef DEBUGGING
3306 uw->regindent = ++PL_regindent;
3307#endif
c277df42 3308
02db2b7b
IZ
3309 REGCP_SET(uw->lastcp);
3310
3311 /* Now go into the first branch */
3312 next = inner;
a687059c 3313 }
a0d0e21e
LW
3314 }
3315 break;
3316 case MINMOD:
3317 minmod = 1;
3318 break;
c277df42
IZ
3319 case CURLYM:
3320 {
00db4c45 3321 I32 l = 0;
c277df42 3322 CHECKPOINT lastcp;
9041c2e3 3323
c277df42
IZ
3324 /* We suppose that the next guy does not need
3325 backtracking: in particular, it is of constant length,
3326 and has no parenths to influence future backrefs. */
3327 ln = ARG1(scan); /* min to match */
3328 n = ARG2(scan); /* max to match */
c277df42
IZ
3329 paren = scan->flags;
3330 if (paren) {
3280af22
NIS
3331 if (paren > PL_regsize)
3332 PL_regsize = paren;
eb160463 3333 if (paren > (I32)*PL_reglastparen)
3280af22 3334 *PL_reglastparen = paren;
c277df42 3335 }
dc45a647 3336 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3337 if (paren)
3338 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3339 PL_reginput = locinput;
c277df42
IZ
3340 if (minmod) {
3341 minmod = 0;
3342 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3343 sayNO;
f31a99c8
HS
3344 /* if we matched something zero-length we don't need to
3345 backtrack - capturing parens are already defined, so
3346 the caveat in the maximal case doesn't apply
3347
3348 XXXX if ln == 0, we can redo this check first time
3349 through the following loop
3350 */
3351 if (ln && l == 0)
3352 n = ln; /* don't backtrack */
3280af22 3353 locinput = PL_reginput;
cca55fe3 3354 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3355 regnode *text_node = next;
3356
cca55fe3 3357 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3358
cca55fe3 3359 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3360 else {
cca55fe3
JP
3361 if (PL_regkind[(U8)OP(text_node)] == REF) {
3362 I32 n, ln;
3363 n = ARG(text_node); /* which paren pair */
3364 ln = PL_regstartp[n];
3365 /* assume yes if we haven't seen CLOSEn */
3366 if (
eb160463 3367 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3368 ln == -1 ||
3369 ln == PL_regendp[n]
3370 ) {
3371 c1 = c2 = -1000;
3372 goto assume_ok_MM;
3373 }
3374 c1 = *(PL_bostr + ln);
3375 }
3376 else { c1 = (U8)*STRING(text_node); }
af5decee 3377 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3378 c2 = PL_fold[c1];
af5decee 3379 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3380 c2 = PL_fold_locale[c1];
3381 else
3382 c2 = c1;
3383 }
a0ed51b3
LW
3384 }
3385 else
c277df42 3386 c1 = c2 = -1000;
cca55fe3 3387 assume_ok_MM:
02db2b7b 3388 REGCP_SET(lastcp);
5f4b28b2 3389 /* This may be improved if l == 0. */
c277df42
IZ
3390 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3391 /* If it could work, try it. */
3392 if (c1 == -1000 ||
3280af22
NIS
3393 UCHARAT(PL_reginput) == c1 ||
3394 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3395 {
3396 if (paren) {
f31a99c8 3397 if (ln) {
cf93c79d
IZ
3398 PL_regstartp[paren] =
3399 HOPc(PL_reginput, -l) - PL_bostr;
3400 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3401 }
3402 else
cf93c79d 3403 PL_regendp[paren] = -1;
c277df42
IZ
3404 }
3405 if (regmatch(next))
3406 sayYES;
02db2b7b 3407 REGCP_UNWIND(lastcp);
c277df42
IZ
3408 }
3409 /* Couldn't or didn't -- move forward. */
3280af22 3410 PL_reginput = locinput;
c277df42
IZ
3411 if (regrepeat_hard(scan, 1, &l)) {
3412 ln++;
3280af22 3413 locinput = PL_reginput;
c277df42
IZ
3414 }
3415 else
3416 sayNO;
3417 }
a0ed51b3
LW
3418 }
3419 else {
c277df42 3420 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3421 /* if we matched something zero-length we don't need to
3422 backtrack, unless the minimum count is zero and we
3423 are capturing the result - in that case the capture
3424 being defined or not may affect later execution
3425 */
3426 if (n != 0 && l == 0 && !(paren && ln == 0))
3427 ln = n; /* don't backtrack */
3280af22 3428 locinput = PL_reginput;
c277df42 3429 DEBUG_r(
5c0ca799 3430 PerlIO_printf(Perl_debug_log,
faccc32b 3431 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3432 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3433 (IV) n, (IV)l)
c277df42
IZ
3434 );
3435 if (n >= ln) {
cca55fe3 3436 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3437 regnode *text_node = next;
3438
cca55fe3 3439 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3440
cca55fe3 3441 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3442 else {
cca55fe3
JP
3443 if (PL_regkind[(U8)OP(text_node)] == REF) {
3444 I32 n, ln;
3445 n = ARG(text_node); /* which paren pair */
3446 ln = PL_regstartp[n];
3447 /* assume yes if we haven't seen CLOSEn */
3448 if (
eb160463 3449 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3450 ln == -1 ||
3451 ln == PL_regendp[n]
3452 ) {
3453 c1 = c2 = -1000;
3454 goto assume_ok_REG;
3455 }
3456 c1 = *(PL_bostr + ln);
3457 }
3458 else { c1 = (U8)*STRING(text_node); }
3459
af5decee 3460 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3461 c2 = PL_fold[c1];
af5decee 3462 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3463 c2 = PL_fold_locale[c1];
3464 else
3465 c2 = c1;
3466 }
a0ed51b3
LW
3467 }
3468 else
c277df42
IZ
3469 c1 = c2 = -1000;
3470 }
cca55fe3 3471 assume_ok_REG:
02db2b7b 3472 REGCP_SET(lastcp);
c277df42
IZ
3473 while (n >= ln) {
3474 /* If it could work, try it. */
3475 if (c1 == -1000 ||
3280af22
NIS
3476 UCHARAT(PL_reginput) == c1 ||
3477 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3478 {
3479 DEBUG_r(
c3464db5 3480 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3481 "%*s trying tail with n=%"IVdf"...\n",
3482 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3483 );
3484 if (paren) {
3485 if (n) {
cf93c79d
IZ
3486 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3487 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3488 }
a0ed51b3 3489 else
cf93c79d 3490 PL_regendp[paren] = -1;
c277df42 3491 }
a0ed51b3
LW
3492 if (regmatch(next))
3493 sayYES;
02db2b7b 3494 REGCP_UNWIND(lastcp);
a0ed51b3 3495 }
c277df42
IZ
3496 /* Couldn't or didn't -- back up. */
3497 n--;
dfe13c55 3498 locinput = HOPc(locinput, -l);
3280af22 3499 PL_reginput = locinput;
c277df42
IZ
3500 }
3501 }
3502 sayNO;
3503 break;
3504 }
3505 case CURLYN:
3506 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3507 if (paren > PL_regsize)
3508 PL_regsize = paren;
eb160463 3509 if (paren > (I32)*PL_reglastparen)
3280af22 3510 *PL_reglastparen = paren;
c277df42
IZ
3511 ln = ARG1(scan); /* min to match */
3512 n = ARG2(scan); /* max to match */
dc45a647 3513 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3514 goto repeat;
a0d0e21e 3515 case CURLY:
c277df42 3516 paren = 0;
a0d0e21e
LW
3517 ln = ARG1(scan); /* min to match */
3518 n = ARG2(scan); /* max to match */
dc45a647 3519 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3520 goto repeat;
3521 case STAR:
3522 ln = 0;
c277df42 3523 n = REG_INFTY;
a0d0e21e 3524 scan = NEXTOPER(scan);
c277df42 3525 paren = 0;
a0d0e21e
LW
3526 goto repeat;
3527 case PLUS:
c277df42
IZ
3528 ln = 1;
3529 n = REG_INFTY;
3530 scan = NEXTOPER(scan);
3531 paren = 0;
3532 repeat:
a0d0e21e
LW
3533 /*
3534 * Lookahead to avoid useless match attempts
3535 * when we know what character comes next.
3536 */
5f80c4cf
JP
3537
3538 /*
3539 * Used to only do .*x and .*?x, but now it allows
3540 * for )'s, ('s and (?{ ... })'s to be in the way
3541 * of the quantifier and the EXACT-like node. -- japhy
3542 */
3543
cca55fe3 3544 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3545 U8 *s;
3546 regnode *text_node = next;
3547
cca55fe3 3548 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3549
cca55fe3 3550 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3551 else {
cca55fe3
JP
3552 if (PL_regkind[(U8)OP(text_node)] == REF) {
3553 I32 n, ln;
3554 n = ARG(text_node); /* which paren pair */
3555 ln = PL_regstartp[n];
3556 /* assume yes if we haven't seen CLOSEn */
3557 if (
eb160463 3558 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3559 ln == -1 ||
3560 ln == PL_regendp[n]
3561 ) {
3562 c1 = c2 = -1000;
3563 goto assume_ok_easy;
3564 }
9246c65e 3565 s = (U8*)PL_bostr + ln;
cca55fe3
JP
3566 }
3567 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3568
3569 if (!UTF) {
3570 c2 = c1 = *s;
f65d3ee7 3571 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3572 c2 = PL_fold[c1];
f65d3ee7 3573 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3574 c2 = PL_fold_locale[c1];
1aa99e6b 3575 }
5f80c4cf 3576 else { /* UTF */
f65d3ee7 3577 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3578 STRLEN ulen1, ulen2;
e7ae6809
JH
3579 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3580 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3581
3582 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3583 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3584
872c91ae
JH
3585 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3586 ckWARN(WARN_UTF8) ?
3587 0 : UTF8_ALLOW_ANY);
3588 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3589 ckWARN(WARN_UTF8) ?
3590 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3591 }
3592 else {
872c91ae
JH
3593 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3594 ckWARN(WARN_UTF8) ?
3595 0 : UTF8_ALLOW_ANY);
5f80c4cf 3596 }
1aa99e6b
IH
3597 }
3598 }
bbce6d69 3599 }
a0d0e21e 3600 else
bbce6d69 3601 c1 = c2 = -1000;
cca55fe3 3602 assume_ok_easy:
3280af22 3603 PL_reginput = locinput;
a0d0e21e 3604 if (minmod) {
c277df42 3605 CHECKPOINT lastcp;
a0d0e21e
LW
3606 minmod = 0;
3607 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3608 sayNO;
a0ed51b3 3609 locinput = PL_reginput;
02db2b7b 3610 REGCP_SET(lastcp);
0fe9bf95 3611 if (c1 != -1000) {
1aa99e6b 3612 char *e; /* Should not check after this */
0fe9bf95 3613 char *old = locinput;
b2f2f093 3614 int count = 0;
0fe9bf95 3615
1aa99e6b 3616 if (n == REG_INFTY) {
0fe9bf95 3617 e = PL_regeol - 1;
1aa99e6b
IH
3618 if (do_utf8)
3619 while (UTF8_IS_CONTINUATION(*(U8*)e))
3620 e--;
3621 }
3622 else if (do_utf8) {
3623 int m = n - ln;
3624 for (e = locinput;
3625 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3626 e += UTF8SKIP(e);
3627 }
3628 else {
3629 e = locinput + n - ln;
3630 if (e >= PL_regeol)
3631 e = PL_regeol - 1;
3632 }
0fe9bf95
IZ
3633 while (1) {
3634 /* Find place 'next' could work */
1aa99e6b
IH
3635 if (!do_utf8) {
3636 if (c1 == c2) {
a8e8ab15
JH
3637 while (locinput <= e &&
3638 UCHARAT(locinput) != c1)
1aa99e6b
IH
3639 locinput++;
3640 } else {
9041c2e3 3641 while (locinput <= e
a8e8ab15
JH
3642 && UCHARAT(locinput) != c1
3643 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3644 locinput++;
3645 }
3646 count = locinput - old;
3647 }
3648 else {
3649 STRLEN len;
3650 if (c1 == c2) {
872c91ae
JH
3651 /* count initialised to
3652 * utf8_distance(old, locinput) */
b2f2f093 3653 while (locinput <= e &&
872c91ae
JH
3654 utf8n_to_uvchr((U8*)locinput,
3655 UTF8_MAXLEN, &len,
3656 ckWARN(WARN_UTF8) ?
eb160463 3657 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3658 locinput += len;
b2f2f093
JH
3659 count++;
3660 }
1aa99e6b 3661 } else {
872c91ae
JH
3662 /* count initialised to
3663 * utf8_distance(old, locinput) */
b2f2f093 3664 while (locinput <= e) {
872c91ae
JH
3665 UV c = utf8n_to_uvchr((U8*)locinput,
3666 UTF8_MAXLEN, &len,
3667 ckWARN(WARN_UTF8) ?
3668 0 : UTF8_ALLOW_ANY);
eb160463 3669 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3670 break;
b2f2f093
JH
3671 locinput += len;
3672 count++;
1aa99e6b
IH
3673 }
3674 }
0fe9bf95 3675 }
9041c2e3 3676 if (locinput > e)
0fe9bf95
IZ
3677 sayNO;
3678 /* PL_reginput == old now */
3679 if (locinput != old) {
3680 ln = 1; /* Did some */
1aa99e6b 3681 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3682 sayNO;
3683 }
3684 /* PL_reginput == locinput now */
29d1e993 3685 TRYPAREN(paren, ln, locinput);
0fe9bf95 3686 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3687 REGCP_UNWIND(lastcp);
0fe9bf95 3688 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3689 old = locinput;
3690 if (do_utf8)
3691 locinput += UTF8SKIP(locinput);
3692 else
3693 locinput++;
b2f2f093 3694 count = 1;
0fe9bf95
IZ
3695 }
3696 }
3697 else
c277df42 3698 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3699 UV c;
3700 if (c1 != -1000) {
3701 if (do_utf8)
872c91ae
JH
3702 c = utf8n_to_uvchr((U8*)PL_reginput,
3703 UTF8_MAXLEN, 0,
3704 ckWARN(WARN_UTF8) ?
3705 0 : UTF8_ALLOW_ANY);
1aa99e6b 3706 else
9041c2e3 3707 c = UCHARAT(PL_reginput);
2390ecbc 3708 /* If it could work, try it. */
eb160463 3709 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3710 {
ecc99935 3711 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3712 REGCP_UNWIND(lastcp);
3713 }
1aa99e6b 3714 }
a0d0e21e 3715 /* If it could work, try it. */
2390ecbc 3716 else if (c1 == -1000)
bbce6d69 3717 {
ecc99935 3718 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3719 REGCP_UNWIND(lastcp);
bbce6d69 3720 }
c277df42 3721 /* Couldn't or didn't -- move forward. */
a0ed51b3 3722 PL_reginput = locinput;
a0d0e21e
LW
3723 if (regrepeat(scan, 1)) {
3724 ln++;
a0ed51b3
LW
3725 locinput = PL_reginput;
3726 }
3727 else
4633a7c4 3728 sayNO;
a0d0e21e
LW
3729 }
3730 }
3731 else {
c277df42 3732 CHECKPOINT lastcp;
a0d0e21e 3733 n = regrepeat(scan, n);
a0ed51b3 3734 locinput = PL_reginput;
22c35a8c 3735 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3736 ((!PL_multiline && OP(next) != MEOL) ||
3737 OP(next) == SEOL || OP(next) == EOS))
3738 {
a0d0e21e 3739 ln = n; /* why back off? */
1aeab75a
GS
3740 /* ...because $ and \Z can match before *and* after
3741 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3742 We should back off by one in this case. */
3743 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3744 ln--;
3745 }
02db2b7b 3746 REGCP_SET(lastcp);
c277df42 3747 if (paren) {
8fa7f367 3748 UV c = 0;
c277df42 3749 while (n >= ln) {
1aa99e6b
IH
3750 if (c1 != -1000) {
3751 if (do_utf8)
872c91ae
JH
3752 c = utf8n_to_uvchr((U8*)PL_reginput,
3753 UTF8_MAXLEN, 0,
3754 ckWARN(WARN_UTF8) ?
3755 0 : UTF8_ALLOW_ANY);
1aa99e6b 3756 else
9041c2e3 3757 c = UCHARAT(PL_reginput);
1aa99e6b 3758 }
c277df42 3759 /* If it could work, try it. */
eb160463 3760 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3761 {
29d1e993 3762 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3763 REGCP_UNWIND(lastcp);
c277df42
IZ
3764 }
3765 /* Couldn't or didn't -- back up. */
3766 n--;
dfe13c55 3767 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3768 }
a0ed51b3
LW
3769 }
3770 else {
8fa7f367 3771 UV c = 0;
c277df42 3772 while (n >= ln) {
1aa99e6b
IH
3773 if (c1 != -1000) {
3774 if (do_utf8)
872c91ae
JH
3775 c = utf8n_to_uvchr((U8*)PL_reginput,
3776 UTF8_MAXLEN, 0,
3777 ckWARN(WARN_UTF8) ?
3778 0 : UTF8_ALLOW_ANY);
1aa99e6b 3779 else
9041c2e3 3780 c = UCHARAT(PL_reginput);
1aa99e6b 3781 }
c277df42 3782 /* If it could work, try it. */
eb160463 3783 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3784 {
29d1e993 3785 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3786 REGCP_UNWIND(lastcp);
c277df42
IZ
3787 }
3788 /* Couldn't or didn't -- back up. */
3789 n--;
dfe13c55 3790 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3791 }
a0d0e21e
LW
3792 }
3793 }
4633a7c4 3794 sayNO;
c277df42 3795 break;
a0d0e21e 3796 case END:
0f5d15d6
IZ
3797 if (PL_reg_call_cc) {
3798 re_cc_state *cur_call_cc = PL_reg_call_cc;
3799 CURCUR *cctmp = PL_regcc;
3800 regexp *re = PL_reg_re;
3801 CHECKPOINT cp, lastcp;
3802
3803 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3804 REGCP_SET(lastcp);
0f5d15d6
IZ
3805 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3806 the caller. */
3807 PL_reginput = locinput; /* Make position available to
3808 the callcc. */
3809 cache_re(PL_reg_call_cc->re);
3810 PL_regcc = PL_reg_call_cc->cc;
3811 PL_reg_call_cc = PL_reg_call_cc->prev;
3812 if (regmatch(cur_call_cc->node)) {
3813 PL_reg_call_cc = cur_call_cc;
3814 regcpblow(cp);
3815 sayYES;
3816 }
02db2b7b 3817 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3818 regcppop();
3819 PL_reg_call_cc = cur_call_cc;
3820 PL_regcc = cctmp;
3821 PL_reg_re = re;
3822 cache_re(re);
3823
3824 DEBUG_r(
3825 PerlIO_printf(Perl_debug_log,
3826 "%*s continuation failed...\n",
3827 REPORT_CODE_OFF+PL_regindent*2, "")
3828 );
7821416a 3829 sayNO_SILENT;
0f5d15d6 3830 }
7821416a
IZ
3831 if (locinput < PL_regtill) {
3832 DEBUG_r(PerlIO_printf(Perl_debug_log,
3833 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3834 PL_colors[4],
3835 (long)(locinput - PL_reg_starttry),
3836 (long)(PL_regtill - PL_reg_starttry),
3837 PL_colors[5]));
3838 sayNO_FINAL; /* Cannot match: too short. */
3839 }
3840 PL_reginput = locinput; /* put where regtry can find it */
3841 sayYES_FINAL; /* Success! */
7e5428c5 3842 case SUCCEED:
3280af22 3843 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3844 sayYES_LOUD; /* Success! */
c277df42
IZ
3845 case SUSPEND:
3846 n = 1;
9fe1d20c 3847 PL_reginput = locinput;
9041c2e3 3848 goto do_ifmatch;
a0d0e21e 3849 case UNLESSM:
c277df42 3850 n = 0;
a0ed51b3 3851 if (scan->flags) {
efb30f32
HS
3852 s = HOPBACKc(locinput, scan->flags);
3853 if (!s)
3854 goto say_yes;
3855 PL_reginput = s;
a0ed51b3
LW
3856 }
3857 else
3858 PL_reginput = locinput;
c277df42
IZ
3859 goto do_ifmatch;
3860 case IFMATCH:
3861 n = 1;
a0ed51b3 3862 if (scan->flags) {
efb30f32
HS
3863 s = HOPBACKc(locinput, scan->flags);
3864 if (!s)
3865 goto say_no;
3866 PL_reginput = s;
a0ed51b3
LW
3867 }
3868 else
3869 PL_reginput = locinput;
3870
c277df42 3871 do_ifmatch:
c277df42
IZ
3872 inner = NEXTOPER(NEXTOPER(scan));
3873 if (regmatch(inner) != n) {
3874 say_no:
3875 if (logical) {
3876 logical = 0;
3877 sw = 0;
3878 goto do_longjump;
a0ed51b3
LW
3879 }
3880 else
c277df42
IZ
3881 sayNO;
3882 }
3883 say_yes:
3884 if (logical) {
3885 logical = 0;
3886 sw = 1;
3887 }
fe44a5e8 3888 if (OP(scan) == SUSPEND) {
3280af22 3889 locinput = PL_reginput;
565764a8 3890 nextchr = UCHARAT(locinput);
fe44a5e8 3891 }
c277df42
IZ
3892 /* FALL THROUGH. */
3893 case LONGJMP:
3894 do_longjump:
3895 next = scan + ARG(scan);
3896 if (next == scan)
3897 next = NULL;
a0d0e21e
LW
3898 break;
3899 default:
b900a521 3900 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3901 PTR2UV(scan), OP(scan));
cea2e8a9 3902 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3903 }
02db2b7b 3904 reenter:
a0d0e21e
LW
3905 scan = next;
3906 }
a687059c 3907
a0d0e21e
LW
3908 /*
3909 * We get here only if there's trouble -- normally "case END" is
3910 * the terminating point.
3911 */
cea2e8a9 3912 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3913 /*NOTREACHED*/
4633a7c4
LW
3914 sayNO;
3915
7821416a
IZ
3916yes_loud:
3917 DEBUG_r(
3918 PerlIO_printf(Perl_debug_log,
3919 "%*s %scould match...%s\n",
3920 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3921 );
3922 goto yes;
3923yes_final:
3924 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3925 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3926yes:
3927#ifdef DEBUGGING
3280af22 3928 PL_regindent--;
4633a7c4 3929#endif
02db2b7b
IZ
3930
3931#if 0 /* Breaks $^R */
3932 if (unwind)
3933 regcpblow(firstcp);
3934#endif
4633a7c4
LW
3935 return 1;
3936
3937no:
7821416a
IZ
3938 DEBUG_r(
3939 PerlIO_printf(Perl_debug_log,
3940 "%*s %sfailed...%s\n",
3941 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3942 );
3943 goto do_no;
3944no_final:
3945do_no:
02db2b7b
IZ
3946 if (unwind) {
3947 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3948
3949 switch (uw->type) {
3950 case RE_UNWIND_BRANCH:
3951 case RE_UNWIND_BRANCHJ:
3952 {
3953 re_unwind_branch_t *uwb = &(uw->branch);
3954 I32 lastparen = uwb->lastparen;
9041c2e3 3955
02db2b7b
IZ
3956 REGCP_UNWIND(uwb->lastcp);
3957 for (n = *PL_reglastparen; n > lastparen; n--)
3958 PL_regendp[n] = -1;
3959 *PL_reglastparen = n;
3960 scan = next = uwb->next;
9041c2e3
NIS
3961 if ( !scan ||
3962 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3963 ? BRANCH : BRANCHJ) ) { /* Failure */
3964 unwind = uwb->prev;
3965#ifdef DEBUGGING
3966 PL_regindent--;
3967#endif
3968 goto do_no;
3969 }
3970 /* Have more choice yet. Reuse the same uwb. */
3971 /*SUPPRESS 560*/
3972 if ((n = (uwb->type == RE_UNWIND_BRANCH
3973 ? NEXT_OFF(next) : ARG(next))))
3974 next += n;
3975 else
3976 next = NULL; /* XXXX Needn't unwinding in this case... */
3977 uwb->next = next;
3978 next = NEXTOPER(scan);
3979 if (uwb->type == RE_UNWIND_BRANCHJ)
3980 next = NEXTOPER(next);
3981 locinput = uwb->locinput;
3982 nextchr = uwb->nextchr;
3983#ifdef DEBUGGING
3984 PL_regindent = uwb->regindent;
3985#endif
3986
3987 goto reenter;
3988 }
3989 /* NOT REACHED */
3990 default:
3991 Perl_croak(aTHX_ "regexp unwind memory corruption");
3992 }
3993 /* NOT REACHED */
3994 }
4633a7c4 3995#ifdef DEBUGGING
3280af22 3996 PL_regindent--;
4633a7c4 3997#endif
a0d0e21e 3998 return 0;
a687059c
LW
3999}
4000
4001/*
4002 - regrepeat - repeatedly match something simple, report how many
4003 */
4004/*
4005 * [This routine now assumes that it will only match on things of length 1.
4006 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4007 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4008 */
76e3520e 4009STATIC I32
cea2e8a9 4010S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 4011{
a0d0e21e 4012 register char *scan;
a0d0e21e 4013 register I32 c;
3280af22 4014 register char *loceol = PL_regeol;
a0ed51b3 4015 register I32 hardcount = 0;
53c4c00c 4016 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4017
3280af22 4018 scan = PL_reginput;
faf11cac
HS
4019 if (max == REG_INFTY)
4020 max = I32_MAX;
4021 else if (max < loceol - scan)
a0d0e21e 4022 loceol = scan + max;
a0d0e21e 4023 switch (OP(p)) {
22c35a8c 4024 case REG_ANY:
1aa99e6b 4025 if (do_utf8) {
ffc61ed2 4026 loceol = PL_regeol;
1aa99e6b 4027 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4028 scan += UTF8SKIP(scan);
4029 hardcount++;
4030 }
4031 } else {
4032 while (scan < loceol && *scan != '\n')
4033 scan++;
a0ed51b3
LW
4034 }
4035 break;
ffc61ed2 4036 case SANY:
def8e4ea
JH
4037 if (do_utf8) {
4038 loceol = PL_regeol;
a0804c9e 4039 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4040 scan += UTF8SKIP(scan);
4041 hardcount++;
4042 }
4043 }
4044 else
4045 scan = loceol;
a0ed51b3 4046 break;
f33976b4
DB
4047 case CANY:
4048 scan = loceol;
4049 break;
bbce6d69 4050 case EXACT: /* length of string is 1 */
cd439c50 4051 c = (U8)*STRING(p);
bbce6d69 4052 while (scan < loceol && UCHARAT(scan) == c)
4053 scan++;
4054 break;
4055 case EXACTF: /* length of string is 1 */
cd439c50 4056 c = (U8)*STRING(p);
bbce6d69 4057 while (scan < loceol &&
22c35a8c 4058 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4059 scan++;
4060 break;
4061 case EXACTFL: /* length of string is 1 */
3280af22 4062 PL_reg_flags |= RF_tainted;
cd439c50 4063 c = (U8)*STRING(p);
bbce6d69 4064 while (scan < loceol &&
22c35a8c 4065 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4066 scan++;
4067 break;
4068 case ANYOF:
ffc61ed2
JH
4069 if (do_utf8) {
4070 loceol = PL_regeol;
388cc4de
HS
4071 while (hardcount < max && scan < loceol) {
4072 bool cont = FALSE;
4073 if (ANYOF_FLAGS(p) & ANYOF_UNICODE) {
4074 if (reginclass(p, (U8*)scan, 0, do_utf8))
4075 cont = TRUE;
4076 }
4077 else {
4078 U8 c = (U8)scan[0];
4079
4080 if (UTF8_IS_INVARIANT(c)) {
4081 if (ANYOF_BITMAP_TEST(p, c))
4082 cont = TRUE;
4083 }
4084 else {
4085 if (reginclass(p, (U8*)scan, 0, do_utf8))
4086 cont = TRUE;
4087 }
4088 }
4089 if (!cont)
4090 break;
ffc61ed2
JH
4091 scan += UTF8SKIP(scan);
4092 hardcount++;
4093 }
4094 } else {
7d3e948e 4095 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4096 scan++;
4097 }
a0d0e21e
LW
4098 break;
4099 case ALNUM:
1aa99e6b 4100 if (do_utf8) {
ffc61ed2 4101 loceol = PL_regeol;
8269fa76 4102 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4103 while (hardcount < max && scan < loceol &&
3568d838 4104 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4105 scan += UTF8SKIP(scan);
4106 hardcount++;
4107 }
4108 } else {
4109 while (scan < loceol && isALNUM(*scan))
4110 scan++;
a0ed51b3
LW
4111 }
4112 break;
bbce6d69 4113 case ALNUML:
3280af22 4114 PL_reg_flags |= RF_tainted;
1aa99e6b 4115 if (do_utf8) {
ffc61ed2 4116 loceol = PL_regeol;
1aa99e6b
IH
4117 while (hardcount < max && scan < loceol &&
4118 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4119 scan += UTF8SKIP(scan);
4120 hardcount++;
4121 }
4122 } else {
4123 while (scan < loceol && isALNUM_LC(*scan))
4124 scan++;
a0ed51b3
LW
4125 }
4126 break;
a0d0e21e 4127 case NALNUM:
1aa99e6b 4128 if (do_utf8) {
ffc61ed2 4129 loceol = PL_regeol;
8269fa76 4130 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4131 while (hardcount < max && scan < loceol &&
3568d838 4132 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4133 scan += UTF8SKIP(scan);
4134 hardcount++;
4135 }
4136 } else {
4137 while (scan < loceol && !isALNUM(*scan))
4138 scan++;
a0ed51b3
LW
4139 }
4140 break;
bbce6d69 4141 case NALNUML:
3280af22 4142 PL_reg_flags |= RF_tainted;
1aa99e6b 4143 if (do_utf8) {
ffc61ed2 4144 loceol = PL_regeol;
1aa99e6b
IH
4145 while (hardcount < max && scan < loceol &&
4146 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4147 scan += UTF8SKIP(scan);
4148 hardcount++;
4149 }
4150 } else {
4151 while (scan < loceol && !isALNUM_LC(*scan))
4152 scan++;
a0ed51b3
LW
4153 }
4154 break;
a0d0e21e 4155 case SPACE:
1aa99e6b 4156 if (do_utf8) {
ffc61ed2 4157 loceol = PL_regeol;
8269fa76 4158 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4159 while (hardcount < max && scan < loceol &&
3568d838
JH
4160 (*scan == ' ' ||
4161 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4162 scan += UTF8SKIP(scan);
4163 hardcount++;
4164 }
4165 } else {
4166 while (scan < loceol && isSPACE(*scan))
4167 scan++;
a0ed51b3
LW
4168 }
4169 break;
bbce6d69 4170 case SPACEL:
3280af22 4171 PL_reg_flags |= RF_tainted;
1aa99e6b 4172 if (do_utf8) {
ffc61ed2 4173 loceol = PL_regeol;
1aa99e6b 4174 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4175 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4176 scan += UTF8SKIP(scan);
4177 hardcount++;
4178 }
4179 } else {
4180 while (scan < loceol && isSPACE_LC(*scan))
4181 scan++;
a0ed51b3
LW
4182 }
4183 break;
a0d0e21e 4184 case NSPACE:
1aa99e6b 4185 if (do_utf8) {
ffc61ed2 4186 loceol = PL_regeol;
8269fa76 4187 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4188 while (hardcount < max && scan < loceol &&
3568d838
JH
4189 !(*scan == ' ' ||
4190 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4191 scan += UTF8SKIP(scan);
4192 hardcount++;
4193 }
4194 } else {
4195 while (scan < loceol && !isSPACE(*scan))
4196 scan++;
4197 break;
a0ed51b3 4198 }
bbce6d69 4199 case NSPACEL:
3280af22 4200 PL_reg_flags |= RF_tainted;
1aa99e6b 4201 if (do_utf8) {
ffc61ed2 4202 loceol = PL_regeol;
1aa99e6b 4203 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4204 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4205 scan += UTF8SKIP(scan);
4206 hardcount++;
4207 }
4208 } else {
4209 while (scan < loceol && !isSPACE_LC(*scan))
4210 scan++;
a0ed51b3
LW
4211 }
4212 break;
a0d0e21e 4213 case DIGIT:
1aa99e6b 4214 if (do_utf8) {
ffc61ed2 4215 loceol = PL_regeol;
8269fa76 4216 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4217 while (hardcount < max && scan < loceol &&
3568d838 4218 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4219 scan += UTF8SKIP(scan);
4220 hardcount++;
4221 }
4222 } else {
4223 while (scan < loceol && isDIGIT(*scan))
4224 scan++;
a0ed51b3
LW
4225 }
4226 break;
a0d0e21e 4227 case NDIGIT:
1aa99e6b 4228 if (do_utf8) {
ffc61ed2 4229 loceol = PL_regeol;
8269fa76 4230 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4231 while (hardcount < max && scan < loceol &&
3568d838 4232 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4233 scan += UTF8SKIP(scan);
4234 hardcount++;
4235 }
4236 } else {
4237 while (scan < loceol && !isDIGIT(*scan))
4238 scan++;
a0ed51b3
LW
4239 }
4240 break;
a0d0e21e
LW
4241 default: /* Called on something of 0 width. */
4242 break; /* So match right here or not at all. */
4243 }
a687059c 4244
a0ed51b3
LW
4245 if (hardcount)
4246 c = hardcount;
4247 else
4248 c = scan - PL_reginput;
3280af22 4249 PL_reginput = scan;
a687059c 4250
9041c2e3 4251 DEBUG_r(
c277df42
IZ
4252 {
4253 SV *prop = sv_newmortal();
4254
4255 regprop(prop, p);
9041c2e3
NIS
4256 PerlIO_printf(Perl_debug_log,
4257 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4258 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4259 });
9041c2e3 4260
a0d0e21e 4261 return(c);
a687059c
LW
4262}
4263
4264/*
c277df42 4265 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4266 *
c277df42
IZ
4267 * The repeater is supposed to have constant length.
4268 */
4269
76e3520e 4270STATIC I32
cea2e8a9 4271S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4272{
b7953727 4273 register char *scan = Nullch;
c277df42 4274 register char *start;
3280af22 4275 register char *loceol = PL_regeol;
a0ed51b3 4276 I32 l = 0;
708e3b05 4277 I32 count = 0, res = 1;
a0ed51b3
LW
4278
4279 if (!max)
4280 return 0;
c277df42 4281
3280af22 4282 start = PL_reginput;
53c4c00c 4283 if (PL_reg_match_utf8) {
708e3b05 4284 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4285 if (!count++) {
4286 l = 0;
4287 while (start < PL_reginput) {
4288 l++;
4289 start += UTF8SKIP(start);
4290 }
4291 *lp = l;
4292 if (l == 0)
4293 return max;
4294 }
4295 if (count == max)
4296 return count;
4297 }
4298 }
4299 else {
708e3b05 4300 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4301 if (!count++) {
4302 *lp = l = PL_reginput - start;
4303 if (max != REG_INFTY && l*max < loceol - scan)
4304 loceol = scan + l*max;
4305 if (l == 0)
4306 return max;
c277df42
IZ
4307 }
4308 }
4309 }
708e3b05 4310 if (!res)
3280af22 4311 PL_reginput = scan;
9041c2e3 4312
a0ed51b3 4313 return count;
c277df42
IZ
4314}
4315
4316/*
ffc61ed2
JH
4317- regclass_swash - prepare the utf8 swash
4318*/
4319
4320SV *
9e55ce06 4321Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4322{
9e55ce06
JH
4323 SV *sw = NULL;
4324 SV *si = NULL;
4325 SV *alt = NULL;
ffc61ed2
JH
4326
4327 if (PL_regdata && PL_regdata->count) {
4328 U32 n = ARG(node);
4329
4330 if (PL_regdata->what[n] == 's') {
4331 SV *rv = (SV*)PL_regdata->data[n];
4332 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4333 SV **ary = AvARRAY(av);
9e55ce06 4334 SV **a, **b;
9041c2e3 4335
9e55ce06
JH
4336 /* See the end of regcomp.c:S_reglass() for
4337 * documentation of these array elements. */
4338
b11f357e
JH
4339 si = *ary;
4340 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4341 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4342
ffc61ed2
JH
4343 if (a)
4344 sw = *a;
4345 else if (si && doinit) {
4346 sw = swash_init("utf8", "", si, 1, 0);
4347 (void)av_store(av, 1, sw);
4348 }
9e55ce06
JH
4349 if (b)
4350 alt = *b;
ffc61ed2
JH
4351 }
4352 }
4353
9e55ce06
JH
4354 if (listsvp)
4355 *listsvp = si;
4356 if (altsvp)
4357 *altsvp = alt;
ffc61ed2
JH
4358
4359 return sw;
4360}
4361
4362/*
ba7b4546 4363 - reginclass - determine if a character falls into a character class
832705d4
JH
4364
4365 The n is the ANYOF regnode, the p is the target string, lenp
4366 is pointer to the maximum length of how far to go in the p
4367 (if the lenp is zero, UTF8SKIP(p) is used),
4368 do_utf8 tells whether the target string is in UTF-8.
4369
bbce6d69 4370 */
4371
76e3520e 4372STATIC bool
ba7b4546 4373S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4374{
ffc61ed2 4375 char flags = ANYOF_FLAGS(n);
bbce6d69 4376 bool match = FALSE;
cc07378b 4377 UV c = *p;
ae9ddab8 4378 STRLEN len = 0;
9e55ce06 4379 STRLEN plen;
1aa99e6b 4380
ae9ddab8
JH
4381 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4382 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4383 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4384
0f0076b4 4385 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4386 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4387 if (lenp)
4388 *lenp = 0;
ffc61ed2 4389 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4390 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4391 match = TRUE;
bbce6d69 4392 }
3568d838 4393 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4394 match = TRUE;
ffc61ed2 4395 if (!match) {
9e55ce06
JH
4396 AV *av;
4397 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4398
4399 if (sw) {
3568d838 4400 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4401 match = TRUE;
4402 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4403 if (!match && lenp && av) {
4404 I32 i;
4405
4406 for (i = 0; i <= av_len(av); i++) {
4407 SV* sv = *av_fetch(av, i, FALSE);
4408 STRLEN len;
4409 char *s = SvPV(sv, len);
4410
061b10df 4411 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4412 *lenp = len;
4413 match = TRUE;
4414 break;
4415 }
4416 }
4417 }
4418 if (!match) {
4a623e43
JH
4419 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4420 STRLEN tmplen;
4421
9e55ce06
JH
4422 to_utf8_fold(p, tmpbuf, &tmplen);
4423 if (swash_fetch(sw, tmpbuf, do_utf8))
4424 match = TRUE;
4425 }
ffc61ed2
JH
4426 }
4427 }
bbce6d69 4428 }
9e55ce06 4429 if (match && lenp && *lenp == 0)
0f0076b4 4430 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4431 }
1aa99e6b 4432 if (!match && c < 256) {
ffc61ed2
JH
4433 if (ANYOF_BITMAP_TEST(n, c))
4434 match = TRUE;
4435 else if (flags & ANYOF_FOLD) {
eb160463 4436 U8 f;
a0ed51b3 4437
ffc61ed2
JH
4438 if (flags & ANYOF_LOCALE) {
4439 PL_reg_flags |= RF_tainted;
4440 f = PL_fold_locale[c];
4441 }
4442 else
4443 f = PL_fold[c];
4444 if (f != c && ANYOF_BITMAP_TEST(n, f))
4445 match = TRUE;
4446 }
4447
4448 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4449 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4450 if (
4451 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4452 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4453 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4454 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4455 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4456 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4457 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4458 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4459 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4460 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4461 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4462 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4463 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4472 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4473 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4474 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4475 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4476 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4477 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4478 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4479 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4480 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4481 ) /* How's that for a conditional? */
4482 {
4483 match = TRUE;
4484 }
a0ed51b3 4485 }
a0ed51b3
LW
4486 }
4487
a0ed51b3
LW
4488 return (flags & ANYOF_INVERT) ? !match : match;
4489}
161b471a 4490
dfe13c55 4491STATIC U8 *
cea2e8a9 4492S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4493{
1aa99e6b
IH
4494 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4495}
4496
4497STATIC U8 *
4498S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4499{
a0ed51b3 4500 if (off >= 0) {
1aa99e6b 4501 while (off-- && s < lim) {
ffc61ed2 4502 /* XXX could check well-formedness here */
a0ed51b3 4503 s += UTF8SKIP(s);
ffc61ed2 4504 }
a0ed51b3
LW
4505 }
4506 else {
4507 while (off++) {
1aa99e6b 4508 if (s > lim) {
a0ed51b3 4509 s--;
ffc61ed2 4510 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4511 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4512 s--;
ffc61ed2
JH
4513 }
4514 /* XXX could check well-formedness here */
a0ed51b3
LW
4515 }
4516 }
4517 }
4518 return s;
4519}
161b471a 4520
dfe13c55 4521STATIC U8 *
1aa99e6b 4522S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4523{
1aa99e6b
IH
4524 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4525}
4526
4527STATIC U8 *
4528S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4529{
4530 if (off >= 0) {
1aa99e6b 4531 while (off-- && s < lim) {
ffc61ed2 4532 /* XXX could check well-formedness here */
a0ed51b3 4533 s += UTF8SKIP(s);
ffc61ed2 4534 }
a0ed51b3
LW
4535 if (off >= 0)
4536 return 0;
4537 }
4538 else {
4539 while (off++) {
1aa99e6b 4540 if (s > lim) {
a0ed51b3 4541 s--;
ffc61ed2 4542 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4543 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4544 s--;
ffc61ed2
JH
4545 }
4546 /* XXX could check well-formedness here */
a0ed51b3
LW
4547 }
4548 else
4549 break;
4550 }
4551 if (off <= 0)
4552 return 0;
4553 }
4554 return s;
4555}
51371543 4556
51371543 4557static void
acfe0abc 4558restore_pos(pTHX_ void *arg)
51371543 4559{
51371543
GS
4560 if (PL_reg_eval_set) {
4561 if (PL_reg_oldsaved) {
4562 PL_reg_re->subbeg = PL_reg_oldsaved;
4563 PL_reg_re->sublen = PL_reg_oldsavedlen;
4564 RX_MATCH_COPIED_on(PL_reg_re);
4565 }
4566 PL_reg_magic->mg_len = PL_reg_oldpos;
4567 PL_reg_eval_set = 0;
4568 PL_curpm = PL_reg_oldcurpm;
4569 }
4570}
33b8afdf
JH
4571
4572STATIC void
4573S_to_utf8_substr(pTHX_ register regexp *prog)
4574{
4575 SV* sv;
4576 if (prog->float_substr && !prog->float_utf8) {
4577 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4578 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4579 sv_utf8_upgrade(sv);
4580 if (SvTAIL(prog->float_substr))
4581 SvTAIL_on(sv);
4582 if (prog->float_substr == prog->check_substr)
4583 prog->check_utf8 = sv;
4584 }
4585 if (prog->anchored_substr && !prog->anchored_utf8) {
4586 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4587 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4588 sv_utf8_upgrade(sv);
4589 if (SvTAIL(prog->anchored_substr))
4590 SvTAIL_on(sv);
4591 if (prog->anchored_substr == prog->check_substr)
4592 prog->check_utf8 = sv;
4593 }
4594}
4595
4596STATIC void
4597S_to_byte_substr(pTHX_ register regexp *prog)
4598{
4599 SV* sv;
4600 if (prog->float_utf8 && !prog->float_substr) {
4601 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4602 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4603 if (sv_utf8_downgrade(sv, TRUE)) {
4604 if (SvTAIL(prog->float_utf8))
4605 SvTAIL_on(sv);
4606 } else {
4607 SvREFCNT_dec(sv);
4608 prog->float_substr = sv = &PL_sv_undef;
4609 }
4610 if (prog->float_utf8 == prog->check_utf8)
4611 prog->check_substr = sv;
4612 }
4613 if (prog->anchored_utf8 && !prog->anchored_substr) {
4614 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4615 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4616 if (sv_utf8_downgrade(sv, TRUE)) {
4617 if (SvTAIL(prog->anchored_utf8))
4618 SvTAIL_on(sv);
4619 } else {
4620 SvREFCNT_dec(sv);
4621 prog->anchored_substr = sv = &PL_sv_undef;
4622 }
4623 if (prog->anchored_utf8 == prog->check_utf8)
4624 prog->check_substr = sv;
4625 }
4626}