This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[FIX] Re: UTF-8 failures (surprise!)
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_regexec_flags my_regexec
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
cad2e5aa 38# define Perl_re_intuit_start my_re_intuit_start
d06ea78c
GS
39/* *These* symbols are masked to allow static link. */
40# define Perl_pregexec my_pregexec
9041c2e3 41# define Perl_reginitcolors my_reginitcolors
490a3f88 42# define Perl_regclass_swash my_regclass_swash
c5be433b
GS
43
44# define PERL_NO_GET_CONTEXT
9041c2e3 45#endif
56953603 46
f0fcb552 47/*SUPPRESS 112*/
a687059c 48/*
e50aee73 49 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
50 *
51 * Copyright (c) 1986 by University of Toronto.
52 * Written by Henry Spencer. Not derived from licensed software.
53 *
54 * Permission is granted to anyone to use this software for any
55 * purpose on any computer system, and to redistribute it freely,
56 * subject to the following restrictions:
57 *
58 * 1. The author is not responsible for the consequences of use of
59 * this software, no matter how awful, even if they arise
60 * from defects in it.
61 *
62 * 2. The origin of this software must not be misrepresented, either
63 * by explicit claim or by omission.
64 *
65 * 3. Altered versions must be plainly marked as such, and must not
66 * be misrepresented as being the original software.
67 *
68 **** Alterations to Henry's code are...
69 ****
be3c0a43 70 **** Copyright (c) 1991-2002, Larry Wall
a687059c 71 ****
9ef589d8
LW
72 **** You may distribute under the terms of either the GNU General Public
73 **** License or the Artistic License, as specified in the README file.
a687059c
LW
74 *
75 * Beware that some of this code is subtly aware of the way operator
76 * precedence is structured in regular expressions. Serious changes in
77 * regular-expression syntax might require a total rethink.
78 */
79#include "EXTERN.h"
864dbfa3 80#define PERL_IN_REGEXEC_C
a687059c 81#include "perl.h"
0f5d15d6 82
a687059c
LW
83#include "regcomp.h"
84
c277df42
IZ
85#define RF_tainted 1 /* tainted information used? */
86#define RF_warned 2 /* warned about big count? */
ce862d02 87#define RF_evaled 4 /* Did an EVAL with setting? */
a0ed51b3
LW
88#define RF_utf8 8 /* String contains multibyte chars? */
89
eb160463 90#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
91
92#define RS_init 1 /* eval environment created */
93#define RS_set 2 /* replsv value is set */
c277df42 94
a687059c
LW
95#ifndef STATIC
96#define STATIC static
97#endif
98
ba7b4546 99#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
5dec093f 100#define REGINCLASS_utf8(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,1) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 101
c277df42
IZ
102/*
103 * Forwards.
104 */
105
33b8afdf 106#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 107#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 108
dfe13c55
GS
109#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
110#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
53c4c00c
JH
111#define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
112#define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
dfe13c55
GS
113#define HOPc(pos,off) ((char*)HOP(pos,off))
114#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
a0d0e21e 115
efb30f32 116#define HOPBACK(pos, off) ( \
e54858b0 117 (PL_reg_match_utf8) \
efb30f32
HS
118 ? reghopmaybe((U8*)pos, -off) \
119 : (pos - off >= PL_bostr) \
120 ? (U8*)(pos - off) \
121 : (U8*)NULL \
122)
123#define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
124
1aa99e6b
IH
125#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
126#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
53c4c00c
JH
127#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
128#define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b
IH
129#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
130#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
131
977e4b84 132#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
51371543 133
5f80c4cf 134/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
135#define JUMPABLE(rn) ( \
136 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
137 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
138 OP(rn) == PLUS || OP(rn) == MINMOD || \
139 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
140)
141
cca55fe3
JP
142#define HAS_TEXT(rn) ( \
143 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
144)
e2d8ce26 145
a84d97b6
HS
146/*
147 Search for mandatory following text node; for lookahead, the text must
148 follow but for lookbehind (rn->flags != 0) we skip to the next step.
149*/
cca55fe3 150#define FIND_NEXT_IMPT(rn) STMT_START { \
e2d8ce26 151 while (JUMPABLE(rn)) \
a84d97b6 152 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
e2d8ce26 153 rn = NEXTOPER(NEXTOPER(rn)); \
cca55fe3
JP
154 else if (OP(rn) == PLUS) \
155 rn = NEXTOPER(rn); \
a84d97b6
HS
156 else if (OP(rn) == IFMATCH) \
157 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 158 else rn += NEXT_OFF(rn); \
5f80c4cf 159} STMT_END
74750237 160
acfe0abc 161static void restore_pos(pTHX_ void *arg);
51371543 162
76e3520e 163STATIC CHECKPOINT
cea2e8a9 164S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 165{
3280af22 166 int retval = PL_savestack_ix;
b1ce53c5
JH
167#define REGCP_PAREN_ELEMS 4
168 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e
LW
169 int p;
170
e49a9654
IH
171 if (paren_elems_to_push < 0)
172 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
173
a01268b5 174#define REGCP_OTHER_ELEMS 6
b1ce53c5 175 SSCHECK(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 176 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 177/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
178 SSPUSHINT(PL_regendp[p]);
179 SSPUSHINT(PL_regstartp[p]);
3280af22 180 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e
LW
181 SSPUSHINT(p);
182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
02db2b7b 198# define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
faccc32b 199 " Setting an EVAL scope, savestack=%"IVdf"\n", \
02db2b7b 200 (IV)PL_savestack_ix)); cp = PL_savestack_ix
c3464db5 201
02db2b7b 202# define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
c3464db5 203 PerlIO_printf(Perl_debug_log, \
faccc32b 204 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
02db2b7b 205 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
c277df42 206
76e3520e 207STATIC char *
cea2e8a9 208S_regcppop(pTHX)
a0d0e21e 209{
b1ce53c5 210 I32 i;
a0d0e21e
LW
211 U32 paren = 0;
212 char *input;
cf93c79d 213 I32 tmps;
b1ce53c5
JH
214
215 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 216 i = SSPOPINT;
b1ce53c5
JH
217 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
218 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 219 input = (char *) SSPOPPTR;
a01268b5 220 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
221 *PL_reglastparen = SSPOPINT;
222 PL_regsize = SSPOPINT;
b1ce53c5
JH
223
224 /* Now restore the parentheses context. */
41123dfd
JH
225 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
226 i > 0; i -= REGCP_PAREN_ELEMS) {
a0d0e21e 227 paren = (U32)SSPOPINT;
3280af22 228 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
229 PL_regstartp[paren] = SSPOPINT;
230 tmps = SSPOPINT;
3280af22
NIS
231 if (paren <= *PL_reglastparen)
232 PL_regendp[paren] = tmps;
c277df42 233 DEBUG_r(
c3464db5 234 PerlIO_printf(Perl_debug_log,
b900a521 235 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 236 (UV)paren, (IV)PL_regstartp[paren],
b900a521 237 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 238 (IV)PL_regendp[paren],
3280af22 239 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 240 );
a0d0e21e 241 }
c277df42 242 DEBUG_r(
eb160463 243 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
c3464db5 244 PerlIO_printf(Perl_debug_log,
faccc32b
JH
245 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
246 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
c277df42
IZ
247 }
248 );
daf18116 249#if 1
dafc8851
JH
250 /* It would seem that the similar code in regtry()
251 * already takes care of this, and in fact it is in
252 * a better location to since this code can #if 0-ed out
253 * but the code in regtry() is needed or otherwise tests
254 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
255 * (as of patchlevel 7877) will fail. Then again,
256 * this code seems to be necessary or otherwise
257 * building DynaLoader will fail:
258 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
259 * --jhi */
eb160463
GS
260 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
261 if ((I32)paren > PL_regsize)
cf93c79d
IZ
262 PL_regstartp[paren] = -1;
263 PL_regendp[paren] = -1;
a0d0e21e 264 }
dafc8851 265#endif
a0d0e21e
LW
266 return input;
267}
268
0f5d15d6 269STATIC char *
cea2e8a9 270S_regcp_set_to(pTHX_ I32 ss)
0f5d15d6
IZ
271{
272 I32 tmp = PL_savestack_ix;
273
274 PL_savestack_ix = ss;
275 regcppop();
276 PL_savestack_ix = tmp;
942e002e 277 return Nullch;
0f5d15d6
IZ
278}
279
280typedef struct re_cc_state
281{
282 I32 ss;
283 regnode *node;
284 struct re_cc_state *prev;
285 CURCUR *cc;
286 regexp *re;
287} re_cc_state;
288
02db2b7b 289#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 290
29d1e993
HS
291#define TRYPAREN(paren, n, input) { \
292 if (paren) { \
293 if (n) { \
294 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
295 PL_regendp[paren] = input - PL_bostr; \
296 } \
297 else \
298 PL_regendp[paren] = -1; \
299 } \
300 if (regmatch(next)) \
301 sayYES; \
302 if (paren && n) \
303 PL_regendp[paren] = -1; \
304}
305
306
a687059c 307/*
e50aee73 308 * pregexec and friends
a687059c
LW
309 */
310
311/*
c277df42 312 - pregexec - match a regexp against a string
a687059c 313 */
c277df42 314I32
864dbfa3 315Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 316 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
317/* strend: pointer to null at end of string */
318/* strbeg: real beginning of string */
319/* minend: end of match must be >=minend after stringarg. */
320/* nosave: For optimizations. */
321{
322 return
9041c2e3 323 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
324 nosave ? 0 : REXEC_COPY_STR);
325}
0f5d15d6
IZ
326
327STATIC void
cea2e8a9 328S_cache_re(pTHX_ regexp *prog)
0f5d15d6
IZ
329{
330 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
331#ifdef DEBUGGING
332 PL_regprogram = prog->program;
333#endif
334 PL_regnpar = prog->nparens;
9041c2e3
NIS
335 PL_regdata = prog->data;
336 PL_reg_re = prog;
0f5d15d6 337}
22e551b9 338
9041c2e3 339/*
cad2e5aa
JH
340 * Need to implement the following flags for reg_anch:
341 *
342 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
343 * USE_INTUIT_ML
344 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
345 * INTUIT_AUTORITATIVE_ML
346 * INTUIT_ONCE_NOML - Intuit can match in one location only.
347 * INTUIT_ONCE_ML
348 *
349 * Another flag for this function: SECOND_TIME (so that float substrs
350 * with giant delta may be not rechecked).
351 */
352
353/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
354
2c2d71f5 355/* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
cad2e5aa
JH
356 Otherwise, only SvCUR(sv) is used to get strbeg. */
357
358/* XXXX We assume that strpos is strbeg unless sv. */
359
6eb5f6b9
JH
360/* XXXX Some places assume that there is a fixed substring.
361 An update may be needed if optimizer marks as "INTUITable"
362 RExen without fixed substrings. Similarly, it is assumed that
363 lengths of all the strings are no more than minlen, thus they
364 cannot come from lookahead.
365 (Or minlen should take into account lookahead.) */
366
2c2d71f5
JH
367/* A failure to find a constant substring means that there is no need to make
368 an expensive call to REx engine, thus we celebrate a failure. Similarly,
369 finding a substring too deep into the string means that less calls to
30944b6d
IZ
370 regtry() should be needed.
371
372 REx compiler's optimizer found 4 possible hints:
373 a) Anchored substring;
374 b) Fixed substring;
375 c) Whether we are anchored (beginning-of-line or \G);
376 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 377 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
378 string which does not contradict any of them.
379 */
2c2d71f5 380
6eb5f6b9
JH
381/* Most of decisions we do here should have been done at compile time.
382 The nodes of the REx which we used for the search should have been
383 deleted from the finite automaton. */
384
cad2e5aa
JH
385char *
386Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
387 char *strend, U32 flags, re_scream_pos_data *data)
388{
b7953727 389 register I32 start_shift = 0;
cad2e5aa 390 /* Should be nonnegative! */
b7953727 391 register I32 end_shift = 0;
2c2d71f5
JH
392 register char *s;
393 register SV *check;
a1933d95 394 char *strbeg;
cad2e5aa 395 char *t;
33b8afdf 396 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 397 I32 ml_anch;
6eb5f6b9 398 register char *other_last = Nullch; /* other substr checked before this */
b7953727 399 char *check_at = Nullch; /* check substr found at this pos */
30944b6d
IZ
400#ifdef DEBUGGING
401 char *i_strpos = strpos;
ce333219 402 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
30944b6d 403#endif
cad2e5aa 404
b8d68ded
JH
405 if (prog->reganch & ROPT_UTF8) {
406 DEBUG_r(PerlIO_printf(Perl_debug_log,
407 "UTF-8 regex...\n"));
408 PL_reg_flags |= RF_utf8;
409 }
410
2a782b5b 411 DEBUG_r({
b8d68ded 412 char *s = PL_reg_match_utf8 ?
c728cb41
JH
413 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
414 strpos;
b8d68ded
JH
415 int len = PL_reg_match_utf8 ?
416 strlen(s) : strend - strpos;
2a782b5b
JH
417 if (!PL_colorset)
418 reginitcolors();
b8d68ded
JH
419 if (PL_reg_match_utf8)
420 DEBUG_r(PerlIO_printf(Perl_debug_log,
421 "UTF-8 target...\n"));
2a782b5b
JH
422 PerlIO_printf(Perl_debug_log,
423 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
424 PL_colors[4],PL_colors[5],PL_colors[0],
425 prog->precomp,
426 PL_colors[1],
427 (strlen(prog->precomp) > 60 ? "..." : ""),
428 PL_colors[0],
429 (int)(len > 60 ? 60 : len),
430 s, PL_colors[1],
431 (len > 60 ? "..." : "")
432 );
433 });
cad2e5aa 434
c344f387
JH
435 /* CHR_DIST() would be more correct here but it makes things slow. */
436 if (prog->minlen > strend - strpos) {
a72c7584
JH
437 DEBUG_r(PerlIO_printf(Perl_debug_log,
438 "String too short... [re_intuit_start]\n"));
cad2e5aa 439 goto fail;
2c2d71f5 440 }
a1933d95 441 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 442 PL_regeol = strend;
33b8afdf
JH
443 if (do_utf8) {
444 if (!prog->check_utf8 && prog->check_substr)
445 to_utf8_substr(prog);
446 check = prog->check_utf8;
447 } else {
448 if (!prog->check_substr && prog->check_utf8)
449 to_byte_substr(prog);
450 check = prog->check_substr;
451 }
452 if (check == &PL_sv_undef) {
453 DEBUG_r(PerlIO_printf(Perl_debug_log,
454 "Non-utf string cannot match utf check string\n"));
455 goto fail;
456 }
2c2d71f5 457 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
458 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
459 || ( (prog->reganch & ROPT_ANCH_BOL)
2c2d71f5 460 && !PL_multiline ) ); /* Check after \n? */
cad2e5aa 461
7e25d62c
JH
462 if (!ml_anch) {
463 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
464 | ROPT_IMPLICIT)) /* not a real BOL */
465 /* SvCUR is not set on references: SvRV and SvPVX overlap */
466 && sv && !SvROK(sv)
467 && (strpos != strbeg)) {
468 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
469 goto fail;
470 }
471 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 472 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 473 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
474 I32 slen;
475
1aa99e6b 476 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
477 if (SvTAIL(check)) {
478 slen = SvCUR(check); /* >= 1 */
cad2e5aa 479
9041c2e3 480 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5
JH
481 || (strend - s == slen && strend[-1] != '\n')) {
482 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
483 goto fail_finish;
cad2e5aa
JH
484 }
485 /* Now should match s[0..slen-2] */
486 slen--;
653099ff 487 if (slen && (*SvPVX(check) != *s
cad2e5aa 488 || (slen > 1
653099ff 489 && memNE(SvPVX(check), s, slen)))) {
2c2d71f5
JH
490 report_neq:
491 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
492 goto fail_finish;
493 }
cad2e5aa 494 }
653099ff
GS
495 else if (*SvPVX(check) != *s
496 || ((slen = SvCUR(check)) > 1
497 && memNE(SvPVX(check), s, slen)))
2c2d71f5
JH
498 goto report_neq;
499 goto success_at_start;
7e25d62c 500 }
cad2e5aa 501 }
2c2d71f5 502 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 503 s = strpos;
2c2d71f5 504 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 505 end_shift = prog->minlen - start_shift -
653099ff 506 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 507 if (!ml_anch) {
653099ff
GS
508 I32 end = prog->check_offset_max + CHR_SVLEN(check)
509 - (SvTAIL(check) != 0);
1aa99e6b 510 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
511
512 if (end_shift < eshift)
513 end_shift = eshift;
514 }
cad2e5aa 515 }
2c2d71f5 516 else { /* Can match at random position */
cad2e5aa
JH
517 ml_anch = 0;
518 s = strpos;
2c2d71f5
JH
519 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
520 /* Should be nonnegative! */
521 end_shift = prog->minlen - start_shift -
653099ff 522 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
523 }
524
2c2d71f5 525#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 526 if (end_shift < 0)
6bbae5e6 527 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
528#endif
529
2c2d71f5
JH
530 restart:
531 /* Find a possible match in the region s..strend by looking for
532 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 533 if (flags & REXEC_SCREAM) {
cad2e5aa
JH
534 I32 p = -1; /* Internal iterator of scream. */
535 I32 *pp = data ? data->scream_pos : &p;
536
2c2d71f5
JH
537 if (PL_screamfirst[BmRARE(check)] >= 0
538 || ( BmRARE(check) == '\n'
539 && (BmPREVIOUS(check) == SvCUR(check) - 1)
540 && SvTAIL(check) ))
9041c2e3 541 s = screaminstr(sv, check,
2c2d71f5 542 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 543 else
2c2d71f5 544 goto fail_finish;
4addbd3b
HS
545 /* we may be pointing at the wrong string */
546 if (s && RX_MATCH_COPIED(prog))
547 s = prog->subbeg + (s - SvPVX(sv));
cad2e5aa
JH
548 if (data)
549 *data->scream_olds = s;
550 }
f33976b4 551 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
552 s = fbm_instr((U8*)(s + start_shift),
553 (U8*)(strend - end_shift),
554 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 555 else
1aa99e6b
IH
556 s = fbm_instr(HOP3(s, start_shift, strend),
557 HOP3(strend, -end_shift, strbeg),
2c2d71f5 558 check, PL_multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
559
560 /* Update the count-of-usability, remove useless subpatterns,
561 unshift s. */
2c2d71f5
JH
562
563 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
564 (s ? "Found" : "Did not find"),
33b8afdf 565 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
2c2d71f5 566 PL_colors[0],
7b0972df
JH
567 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
568 SvPVX(check),
2c2d71f5
JH
569 PL_colors[1], (SvTAIL(check) ? "$" : ""),
570 (s ? " at offset " : "...\n") ) );
571
572 if (!s)
573 goto fail_finish;
574
6eb5f6b9
JH
575 check_at = s;
576
2c2d71f5 577 /* Finish the diagnostic message */
30944b6d 578 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
579
580 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
581 Start with the other substr.
582 XXXX no SCREAM optimization yet - and a very coarse implementation
583 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
584 *always* match. Probably should be marked during compile...
585 Probably it is right to do no SCREAM here...
586 */
587
33b8afdf 588 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 589 /* Take into account the "other" substring. */
2c2d71f5
JH
590 /* XXXX May be hopelessly wrong for UTF... */
591 if (!other_last)
6eb5f6b9 592 other_last = strpos;
33b8afdf 593 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
594 do_other_anchored:
595 {
1aa99e6b 596 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
2c2d71f5 597 char *s1 = s;
33b8afdf 598 SV* must;
2c2d71f5 599
2c2d71f5
JH
600 t = s - prog->check_offset_max;
601 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 602 && (!do_utf8
1aa99e6b 603 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
2c2d71f5 604 && t > strpos)))
30944b6d 605 /* EMPTY */;
2c2d71f5
JH
606 else
607 t = strpos;
1aa99e6b 608 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
609 if (t < other_last) /* These positions already checked */
610 t = other_last;
1aa99e6b 611 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
612 if (last < last1)
613 last1 = last;
614 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
615 /* On end-of-str: see comment below. */
33b8afdf
JH
616 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
617 if (must == &PL_sv_undef) {
618 s = (char*)NULL;
619 DEBUG_r(must = prog->anchored_utf8); /* for debug */
620 }
621 else
622 s = fbm_instr(
623 (unsigned char*)t,
624 HOP3(HOP3(last1, prog->anchored_offset, strend)
625 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
626 must,
627 PL_multiline ? FBMrf_MULTILINE : 0
628 );
1aa99e6b
IH
629 DEBUG_r(PerlIO_printf(Perl_debug_log,
630 "%s anchored substr `%s%.*s%s'%s",
2c2d71f5
JH
631 (s ? "Found" : "Contradicts"),
632 PL_colors[0],
33b8afdf
JH
633 (int)(SvCUR(must)
634 - (SvTAIL(must)!=0)),
635 SvPVX(must),
636 PL_colors[1], (SvTAIL(must) ? "$" : "")));
2c2d71f5
JH
637 if (!s) {
638 if (last1 >= last2) {
639 DEBUG_r(PerlIO_printf(Perl_debug_log,
640 ", giving up...\n"));
641 goto fail_finish;
642 }
643 DEBUG_r(PerlIO_printf(Perl_debug_log,
644 ", trying floating at offset %ld...\n",
1aa99e6b
IH
645 (long)(HOP3c(s1, 1, strend) - i_strpos)));
646 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
647 s = HOP3c(last, 1, strend);
2c2d71f5
JH
648 goto restart;
649 }
650 else {
651 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 652 (long)(s - i_strpos)));
1aa99e6b
IH
653 t = HOP3c(s, -prog->anchored_offset, strbeg);
654 other_last = HOP3c(s, 1, strend);
30944b6d 655 s = s1;
2c2d71f5
JH
656 if (t == strpos)
657 goto try_at_start;
2c2d71f5
JH
658 goto try_at_offset;
659 }
30944b6d 660 }
2c2d71f5
JH
661 }
662 else { /* Take into account the floating substring. */
33b8afdf
JH
663 char *last, *last1;
664 char *s1 = s;
665 SV* must;
666
667 t = HOP3c(s, -start_shift, strbeg);
668 last1 = last =
669 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
670 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
671 last = HOP3c(t, prog->float_max_offset, strend);
672 s = HOP3c(t, prog->float_min_offset, strend);
673 if (s < other_last)
674 s = other_last;
2c2d71f5 675 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
676 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
677 /* fbm_instr() takes into account exact value of end-of-str
678 if the check is SvTAIL(ed). Since false positives are OK,
679 and end-of-str is not later than strend we are OK. */
680 if (must == &PL_sv_undef) {
681 s = (char*)NULL;
682 DEBUG_r(must = prog->float_utf8); /* for debug message */
683 }
684 else
2c2d71f5 685 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
686 (unsigned char*)last + SvCUR(must)
687 - (SvTAIL(must)!=0),
688 must, PL_multiline ? FBMrf_MULTILINE : 0);
689 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
690 (s ? "Found" : "Contradicts"),
691 PL_colors[0],
692 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
693 SvPVX(must),
694 PL_colors[1], (SvTAIL(must) ? "$" : "")));
695 if (!s) {
696 if (last1 == last) {
2c2d71f5 697 DEBUG_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
698 ", giving up...\n"));
699 goto fail_finish;
2c2d71f5 700 }
33b8afdf
JH
701 DEBUG_r(PerlIO_printf(Perl_debug_log,
702 ", trying anchored starting at offset %ld...\n",
703 (long)(s1 + 1 - i_strpos)));
704 other_last = last;
705 s = HOP3c(t, 1, strend);
706 goto restart;
707 }
708 else {
709 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
710 (long)(s - i_strpos)));
711 other_last = s; /* Fix this later. --Hugo */
712 s = s1;
713 if (t == strpos)
714 goto try_at_start;
715 goto try_at_offset;
716 }
2c2d71f5 717 }
cad2e5aa 718 }
2c2d71f5
JH
719
720 t = s - prog->check_offset_max;
2c2d71f5 721 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 722 && (!do_utf8
1aa99e6b
IH
723 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
724 && t > strpos))) {
2c2d71f5
JH
725 /* Fixed substring is found far enough so that the match
726 cannot start at strpos. */
727 try_at_offset:
cad2e5aa 728 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
729 /* Eventually fbm_*() should handle this, but often
730 anchored_offset is not 0, so this check will not be wasted. */
731 /* XXXX In the code below we prefer to look for "^" even in
732 presence of anchored substrings. And we search even
733 beyond the found float position. These pessimizations
734 are historical artefacts only. */
735 find_anchor:
2c2d71f5 736 while (t < strend - prog->minlen) {
cad2e5aa 737 if (*t == '\n') {
4ee3650e 738 if (t < check_at - prog->check_offset_min) {
33b8afdf 739 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
740 /* Since we moved from the found position,
741 we definitely contradict the found anchored
30944b6d
IZ
742 substr. Due to the above check we do not
743 contradict "check" substr.
744 Thus we can arrive here only if check substr
745 is float. Redo checking for "other"=="fixed".
746 */
9041c2e3 747 strpos = t + 1;
30944b6d
IZ
748 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
749 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
750 goto do_other_anchored;
751 }
4ee3650e
GS
752 /* We don't contradict the found floating substring. */
753 /* XXXX Why not check for STCLASS? */
cad2e5aa 754 s = t + 1;
2c2d71f5 755 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
30944b6d 756 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
757 goto set_useful;
758 }
4ee3650e
GS
759 /* Position contradicts check-string */
760 /* XXXX probably better to look for check-string
761 than for "\n", so one should lower the limit for t? */
762 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
30944b6d 763 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 764 other_last = strpos = s = t + 1;
cad2e5aa
JH
765 goto restart;
766 }
767 t++;
768 }
2c2d71f5
JH
769 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
770 PL_colors[0],PL_colors[1]));
771 goto fail_finish;
cad2e5aa 772 }
f5952150
GS
773 else {
774 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
775 PL_colors[0],PL_colors[1]));
776 }
cad2e5aa
JH
777 s = t;
778 set_useful:
33b8afdf 779 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
780 }
781 else {
f5952150 782 /* The found string does not prohibit matching at strpos,
2c2d71f5 783 - no optimization of calling REx engine can be performed,
f5952150
GS
784 unless it was an MBOL and we are not after MBOL,
785 or a future STCLASS check will fail this. */
2c2d71f5
JH
786 try_at_start:
787 /* Even in this situation we may use MBOL flag if strpos is offset
788 wrt the start of the string. */
05b4157f 789 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 790 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
791 /* May be due to an implicit anchor of m{.*foo} */
792 && !(prog->reganch & ROPT_IMPLICIT))
793 {
cad2e5aa
JH
794 t = strpos;
795 goto find_anchor;
796 }
30944b6d 797 DEBUG_r( if (ml_anch)
f5952150
GS
798 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
799 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
30944b6d 800 );
2c2d71f5 801 success_at_start:
30944b6d 802 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
803 && (do_utf8 ? (
804 prog->check_utf8 /* Could be deleted already */
805 && --BmUSEFUL(prog->check_utf8) < 0
806 && (prog->check_utf8 == prog->float_utf8)
807 ) : (
808 prog->check_substr /* Could be deleted already */
809 && --BmUSEFUL(prog->check_substr) < 0
810 && (prog->check_substr == prog->float_substr)
811 )))
66e933ab 812 {
cad2e5aa 813 /* If flags & SOMETHING - do not do it many times on the same match */
f5952150 814 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
815 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
816 if (do_utf8 ? prog->check_substr : prog->check_utf8)
817 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
818 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
819 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
5e39e1e5 820 check = Nullsv; /* abort */
cad2e5aa 821 s = strpos;
3cf5c195
IZ
822 /* XXXX This is a remnant of the old implementation. It
823 looks wasteful, since now INTUIT can use many
6eb5f6b9 824 other heuristics. */
cad2e5aa
JH
825 prog->reganch &= ~RE_USE_INTUIT;
826 }
827 else
828 s = strpos;
829 }
830
6eb5f6b9
JH
831 /* Last resort... */
832 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
833 if (prog->regstclass) {
834 /* minlen == 0 is possible if regstclass is \b or \B,
835 and the fixed substr is ''$.
836 Since minlen is already taken into account, s+1 is before strend;
837 accidentally, minlen >= 1 guaranties no false positives at s + 1
838 even for \b or \B. But (minlen? 1 : 0) below assumes that
839 regstclass does not come from lookahead... */
840 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
841 This leaves EXACTF only, which is dealt with in find_byclass(). */
1aa99e6b 842 U8* str = (U8*)STRING(prog->regstclass);
66e933ab 843 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
1aa99e6b 844 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 845 : 1);
33b8afdf 846 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 847 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 848 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
849 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
850 cl_l, strend)
851 : strend);
a1933d95 852 char *startpos = strbeg;
6eb5f6b9
JH
853
854 t = s;
76384e4a 855 if (prog->reganch & ROPT_UTF8) {
ffc61ed2 856 PL_regdata = prog->data;
76384e4a
GS
857 PL_bostr = startpos;
858 }
f33976b4 859 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
6eb5f6b9
JH
860 if (!s) {
861#ifdef DEBUGGING
b7953727 862 char *what = 0;
6eb5f6b9
JH
863#endif
864 if (endpos == strend) {
865 DEBUG_r( PerlIO_printf(Perl_debug_log,
866 "Could not match STCLASS...\n") );
867 goto fail;
868 }
66e933ab
GS
869 DEBUG_r( PerlIO_printf(Perl_debug_log,
870 "This position contradicts STCLASS...\n") );
653099ff
GS
871 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
872 goto fail;
6eb5f6b9 873 /* Contradict one of substrings */
33b8afdf
JH
874 if (prog->anchored_substr || prog->anchored_utf8) {
875 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
6eb5f6b9
JH
876 DEBUG_r( what = "anchored" );
877 hop_and_restart:
1aa99e6b 878 s = HOP3c(t, 1, strend);
66e933ab
GS
879 if (s + start_shift + end_shift > strend) {
880 /* XXXX Should be taken into account earlier? */
881 DEBUG_r( PerlIO_printf(Perl_debug_log,
882 "Could not match STCLASS...\n") );
883 goto fail;
884 }
5e39e1e5
HS
885 if (!check)
886 goto giveup;
6eb5f6b9 887 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 888 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
889 what, (long)(s + start_shift - i_strpos)) );
890 goto restart;
891 }
66e933ab 892 /* Have both, check_string is floating */
6eb5f6b9
JH
893 if (t + start_shift >= check_at) /* Contradicts floating=check */
894 goto retry_floating_check;
895 /* Recheck anchored substring, but not floating... */
9041c2e3 896 s = check_at;
5e39e1e5
HS
897 if (!check)
898 goto giveup;
6eb5f6b9 899 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150 900 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
901 (long)(other_last - i_strpos)) );
902 goto do_other_anchored;
903 }
60e71179
GS
904 /* Another way we could have checked stclass at the
905 current position only: */
906 if (ml_anch) {
907 s = t = t + 1;
5e39e1e5
HS
908 if (!check)
909 goto giveup;
60e71179 910 DEBUG_r( PerlIO_printf(Perl_debug_log,
f5952150
GS
911 "Looking for /%s^%s/m starting at offset %ld...\n",
912 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
60e71179 913 goto try_at_offset;
66e933ab 914 }
33b8afdf 915 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 916 goto fail;
6eb5f6b9
JH
917 /* Check is floating subtring. */
918 retry_floating_check:
919 t = check_at - start_shift;
920 DEBUG_r( what = "floating" );
921 goto hop_and_restart;
922 }
b7953727
JH
923 if (t != s) {
924 DEBUG_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 925 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
926 (long)(t - i_strpos), (long)(s - i_strpos))
927 );
928 }
929 else {
930 DEBUG_r(PerlIO_printf(Perl_debug_log,
931 "Does not contradict STCLASS...\n");
932 );
933 }
6eb5f6b9 934 }
5e39e1e5
HS
935 giveup:
936 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
937 PL_colors[4], (check ? "Guessed" : "Giving up"),
938 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 939 return s;
2c2d71f5
JH
940
941 fail_finish: /* Substring not found */
33b8afdf
JH
942 if (prog->check_substr || prog->check_utf8) /* could be removed already */
943 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 944 fail:
2c2d71f5 945 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
cad2e5aa
JH
946 PL_colors[4],PL_colors[5]));
947 return Nullch;
948}
9661b544 949
6eb5f6b9 950/* We know what class REx starts with. Try to find this position... */
3c3eec57
GS
951STATIC char *
952S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
a687059c 953{
6eb5f6b9
JH
954 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
955 char *m;
d8093b23
G
956 STRLEN ln;
957 unsigned int c1;
958 unsigned int c2;
6eb5f6b9
JH
959 char *e;
960 register I32 tmp = 1; /* Scratch variable? */
53c4c00c 961 register bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 962
6eb5f6b9
JH
963 /* We know what class it must start with. */
964 switch (OP(c)) {
6eb5f6b9 965 case ANYOF:
388cc4de
HS
966 if (do_utf8) {
967 while (s < strend) {
968 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
969 !UTF8_IS_INVARIANT((U8)s[0]) ?
970 reginclass(c, (U8*)s, 0, do_utf8) :
971 REGINCLASS(c, (U8*)s)) {
972 if (tmp && (norun || regtry(prog, s)))
973 goto got_it;
974 else
975 tmp = doevery;
976 }
977 else
978 tmp = 1;
979 s += UTF8SKIP(s);
980 }
981 }
982 else {
983 while (s < strend) {
984 STRLEN skip = 1;
985
986 if (REGINCLASS(c, (U8*)s) ||
987 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
988 /* The assignment of 2 is intentional:
989 * for the folded sharp s, the skip is 2. */
990 (skip = SHARP_S_SKIP))) {
991 if (tmp && (norun || regtry(prog, s)))
992 goto got_it;
993 else
994 tmp = doevery;
995 }
996 else
997 tmp = 1;
998 s += skip;
999 }
a0d0e21e 1000 }
6eb5f6b9 1001 break;
f33976b4
DB
1002 case CANY:
1003 while (s < strend) {
1004 if (tmp && (norun || regtry(prog, s)))
1005 goto got_it;
1006 else
1007 tmp = doevery;
1008 s++;
1009 }
1010 break;
6eb5f6b9
JH
1011 case EXACTF:
1012 m = STRING(c);
1013 ln = STR_LEN(c);
1aa99e6b 1014 if (UTF) {
a2a2844f 1015 STRLEN ulen1, ulen2;
e7ae6809
JH
1016 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
1017 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
1018
1019 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1020 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1021
872c91ae
JH
1022 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
1023 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1024 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
1025 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1aa99e6b
IH
1026 }
1027 else {
1028 c1 = *(U8*)m;
1029 c2 = PL_fold[c1];
1030 }
6eb5f6b9
JH
1031 goto do_exactf;
1032 case EXACTFL:
1033 m = STRING(c);
1034 ln = STR_LEN(c);
d8093b23 1035 c1 = *(U8*)m;
6eb5f6b9
JH
1036 c2 = PL_fold_locale[c1];
1037 do_exactf:
6795a346 1038 e = HOP3c(strend, -(I32)ln, s);
b3c9acc1 1039
6eb5f6b9
JH
1040 if (norun && e < s)
1041 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1042
60a8b682
JH
1043 /* The idea in the EXACTF* cases is to first find the
1044 * first character of the EXACTF* node and then, if
1045 * necessary, case-insensitively compare the full
1046 * text of the node. The c1 and c2 are the first
1047 * characters (though in Unicode it gets a bit
1048 * more complicated because there are more cases
7f16dd3d
JH
1049 * than just upper and lower: one needs to use
1050 * the so-called folding case for case-insensitive
1051 * matching (called "loose matching" in Unicode).
1052 * ibcmp_utf8() will do just that. */
60a8b682 1053
1aa99e6b 1054 if (do_utf8) {
575cac57
JH
1055 UV c, f;
1056 U8 tmpbuf [UTF8_MAXLEN+1];
1057 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
1058 STRLEN len, foldlen;
d7f013c8 1059
09091399 1060 if (c1 == c2) {
1aa99e6b 1061 while (s <= e) {
872c91ae
JH
1062 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1063 ckWARN(WARN_UTF8) ?
1064 0 : UTF8_ALLOW_ANY);
80aecb99
JH
1065 if ( c == c1
1066 && (ln == len ||
66423254 1067 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1068 m, (char **)0, ln, (bool)UTF))
55da9344 1069 && (norun || regtry(prog, s)) )
1aa99e6b 1070 goto got_it;
80aecb99
JH
1071 else {
1072 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1073 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1074 if ( f != c
1075 && (f == c1 || f == c2)
1076 && (ln == foldlen ||
66423254
JH
1077 !ibcmp_utf8((char *) foldbuf,
1078 (char **)0, foldlen, do_utf8,
d07ddd77 1079 m,
eb160463 1080 (char **)0, ln, (bool)UTF))
80aecb99
JH
1081 && (norun || regtry(prog, s)) )
1082 goto got_it;
1083 }
1aa99e6b
IH
1084 s += len;
1085 }
09091399
JH
1086 }
1087 else {
1aa99e6b 1088 while (s <= e) {
872c91ae
JH
1089 c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
1090 ckWARN(WARN_UTF8) ?
1091 0 : UTF8_ALLOW_ANY);
80aecb99 1092
60a8b682 1093 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1094 * Note that not all the possible combinations
1095 * are handled here: some of them are handled
1096 * by the standard folding rules, and some of
1097 * them (the character class or ANYOF cases)
1098 * are handled during compiletime in
1099 * regexec.c:S_regclass(). */
880bd946
JH
1100 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1101 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1102 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99
JH
1103
1104 if ( (c == c1 || c == c2)
1105 && (ln == len ||
66423254 1106 ibcmp_utf8(s, (char **)0, 0, do_utf8,
eb160463 1107 m, (char **)0, ln, (bool)UTF))
55da9344 1108 && (norun || regtry(prog, s)) )
1aa99e6b 1109 goto got_it;
80aecb99
JH
1110 else {
1111 uvchr_to_utf8(tmpbuf, c);
ac7e0132 1112 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
80aecb99
JH
1113 if ( f != c
1114 && (f == c1 || f == c2)
1115 && (ln == foldlen ||
a6872d42 1116 !ibcmp_utf8((char *) foldbuf,
66423254 1117 (char **)0, foldlen, do_utf8,
d07ddd77 1118 m,
eb160463 1119 (char **)0, ln, (bool)UTF))
80aecb99
JH
1120 && (norun || regtry(prog, s)) )
1121 goto got_it;
1122 }
1aa99e6b
IH
1123 s += len;
1124 }
09091399 1125 }
1aa99e6b
IH
1126 }
1127 else {
1128 if (c1 == c2)
1129 while (s <= e) {
1130 if ( *(U8*)s == c1
1131 && (ln == 1 || !(OP(c) == EXACTF
1132 ? ibcmp(s, m, ln)
1133 : ibcmp_locale(s, m, ln)))
1134 && (norun || regtry(prog, s)) )
1135 goto got_it;
1136 s++;
1137 }
1138 else
1139 while (s <= e) {
1140 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1141 && (ln == 1 || !(OP(c) == EXACTF
1142 ? ibcmp(s, m, ln)
1143 : ibcmp_locale(s, m, ln)))
1144 && (norun || regtry(prog, s)) )
1145 goto got_it;
1146 s++;
1147 }
b3c9acc1
IZ
1148 }
1149 break;
bbce6d69 1150 case BOUNDL:
3280af22 1151 PL_reg_flags |= RF_tainted;
bbce6d69 1152 /* FALL THROUGH */
a0d0e21e 1153 case BOUND:
ffc61ed2 1154 if (do_utf8) {
12d33761 1155 if (s == PL_bostr)
ffc61ed2
JH
1156 tmp = '\n';
1157 else {
b4f7163a 1158 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1159
b4f7163a 1160 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1161 }
1162 tmp = ((OP(c) == BOUND ?
9041c2e3 1163 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1164 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1165 while (s < strend) {
1166 if (tmp == !(OP(c) == BOUND ?
3568d838 1167 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1168 isALNUM_LC_utf8((U8*)s)))
1169 {
1170 tmp = !tmp;
1171 if ((norun || regtry(prog, s)))
1172 goto got_it;
1173 }
1174 s += UTF8SKIP(s);
a687059c 1175 }
a0d0e21e 1176 }
667bb95a 1177 else {
12d33761 1178 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1179 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1180 while (s < strend) {
1181 if (tmp ==
1182 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1183 tmp = !tmp;
1184 if ((norun || regtry(prog, s)))
1185 goto got_it;
1186 }
1187 s++;
a0ed51b3 1188 }
a0ed51b3 1189 }
6eb5f6b9 1190 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1191 goto got_it;
1192 break;
bbce6d69 1193 case NBOUNDL:
3280af22 1194 PL_reg_flags |= RF_tainted;
bbce6d69 1195 /* FALL THROUGH */
a0d0e21e 1196 case NBOUND:
ffc61ed2 1197 if (do_utf8) {
12d33761 1198 if (s == PL_bostr)
ffc61ed2
JH
1199 tmp = '\n';
1200 else {
b4f7163a 1201 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
9041c2e3 1202
b4f7163a 1203 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
1204 }
1205 tmp = ((OP(c) == NBOUND ?
9041c2e3 1206 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
8269fa76 1207 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2
JH
1208 while (s < strend) {
1209 if (tmp == !(OP(c) == NBOUND ?
3568d838 1210 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1211 isALNUM_LC_utf8((U8*)s)))
1212 tmp = !tmp;
1213 else if ((norun || regtry(prog, s)))
1214 goto got_it;
1215 s += UTF8SKIP(s);
1216 }
a0d0e21e 1217 }
667bb95a 1218 else {
12d33761 1219 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1220 tmp = ((OP(c) == NBOUND ?
1221 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1222 while (s < strend) {
1223 if (tmp ==
1224 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1225 tmp = !tmp;
1226 else if ((norun || regtry(prog, s)))
1227 goto got_it;
1228 s++;
1229 }
a0ed51b3 1230 }
6eb5f6b9 1231 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
a0ed51b3
LW
1232 goto got_it;
1233 break;
a0d0e21e 1234 case ALNUM:
ffc61ed2 1235 if (do_utf8) {
8269fa76 1236 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1237 while (s < strend) {
3568d838 1238 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1239 if (tmp && (norun || regtry(prog, s)))
1240 goto got_it;
1241 else
1242 tmp = doevery;
1243 }
bbce6d69 1244 else
ffc61ed2
JH
1245 tmp = 1;
1246 s += UTF8SKIP(s);
bbce6d69 1247 }
bbce6d69 1248 }
ffc61ed2
JH
1249 else {
1250 while (s < strend) {
1251 if (isALNUM(*s)) {
1252 if (tmp && (norun || regtry(prog, s)))
1253 goto got_it;
1254 else
1255 tmp = doevery;
1256 }
a0ed51b3 1257 else
ffc61ed2
JH
1258 tmp = 1;
1259 s++;
a0ed51b3 1260 }
a0ed51b3
LW
1261 }
1262 break;
bbce6d69 1263 case ALNUML:
3280af22 1264 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1265 if (do_utf8) {
1266 while (s < strend) {
1267 if (isALNUM_LC_utf8((U8*)s)) {
1268 if (tmp && (norun || regtry(prog, s)))
1269 goto got_it;
1270 else
1271 tmp = doevery;
1272 }
a687059c 1273 else
ffc61ed2
JH
1274 tmp = 1;
1275 s += UTF8SKIP(s);
a0d0e21e 1276 }
a0d0e21e 1277 }
ffc61ed2
JH
1278 else {
1279 while (s < strend) {
1280 if (isALNUM_LC(*s)) {
1281 if (tmp && (norun || regtry(prog, s)))
1282 goto got_it;
1283 else
1284 tmp = doevery;
1285 }
a0ed51b3 1286 else
ffc61ed2
JH
1287 tmp = 1;
1288 s++;
a0ed51b3 1289 }
a0ed51b3
LW
1290 }
1291 break;
a0d0e21e 1292 case NALNUM:
ffc61ed2 1293 if (do_utf8) {
8269fa76 1294 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 1295 while (s < strend) {
3568d838 1296 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
ffc61ed2
JH
1297 if (tmp && (norun || regtry(prog, s)))
1298 goto got_it;
1299 else
1300 tmp = doevery;
1301 }
bbce6d69 1302 else
ffc61ed2
JH
1303 tmp = 1;
1304 s += UTF8SKIP(s);
bbce6d69 1305 }
bbce6d69 1306 }
ffc61ed2
JH
1307 else {
1308 while (s < strend) {
1309 if (!isALNUM(*s)) {
1310 if (tmp && (norun || regtry(prog, s)))
1311 goto got_it;
1312 else
1313 tmp = doevery;
1314 }
a0ed51b3 1315 else
ffc61ed2
JH
1316 tmp = 1;
1317 s++;
a0ed51b3 1318 }
a0ed51b3
LW
1319 }
1320 break;
bbce6d69 1321 case NALNUML:
3280af22 1322 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1323 if (do_utf8) {
1324 while (s < strend) {
1325 if (!isALNUM_LC_utf8((U8*)s)) {
1326 if (tmp && (norun || regtry(prog, s)))
1327 goto got_it;
1328 else
1329 tmp = doevery;
1330 }
a687059c 1331 else
ffc61ed2
JH
1332 tmp = 1;
1333 s += UTF8SKIP(s);
a687059c 1334 }
a0d0e21e 1335 }
ffc61ed2
JH
1336 else {
1337 while (s < strend) {
1338 if (!isALNUM_LC(*s)) {
1339 if (tmp && (norun || regtry(prog, s)))
1340 goto got_it;
1341 else
1342 tmp = doevery;
1343 }
a0ed51b3 1344 else
ffc61ed2
JH
1345 tmp = 1;
1346 s++;
a0ed51b3 1347 }
a0ed51b3
LW
1348 }
1349 break;
a0d0e21e 1350 case SPACE:
ffc61ed2 1351 if (do_utf8) {
8269fa76 1352 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1353 while (s < strend) {
3568d838 1354 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
ffc61ed2
JH
1355 if (tmp && (norun || regtry(prog, s)))
1356 goto got_it;
1357 else
1358 tmp = doevery;
1359 }
a0d0e21e 1360 else
ffc61ed2
JH
1361 tmp = 1;
1362 s += UTF8SKIP(s);
2304df62 1363 }
a0d0e21e 1364 }
ffc61ed2
JH
1365 else {
1366 while (s < strend) {
1367 if (isSPACE(*s)) {
1368 if (tmp && (norun || regtry(prog, s)))
1369 goto got_it;
1370 else
1371 tmp = doevery;
1372 }
a0ed51b3 1373 else
ffc61ed2
JH
1374 tmp = 1;
1375 s++;
a0ed51b3 1376 }
a0ed51b3
LW
1377 }
1378 break;
bbce6d69 1379 case SPACEL:
3280af22 1380 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1381 if (do_utf8) {
1382 while (s < strend) {
1383 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1384 if (tmp && (norun || regtry(prog, s)))
1385 goto got_it;
1386 else
1387 tmp = doevery;
1388 }
bbce6d69 1389 else
ffc61ed2
JH
1390 tmp = 1;
1391 s += UTF8SKIP(s);
bbce6d69 1392 }
bbce6d69 1393 }
ffc61ed2
JH
1394 else {
1395 while (s < strend) {
1396 if (isSPACE_LC(*s)) {
1397 if (tmp && (norun || regtry(prog, s)))
1398 goto got_it;
1399 else
1400 tmp = doevery;
1401 }
a0ed51b3 1402 else
ffc61ed2
JH
1403 tmp = 1;
1404 s++;
a0ed51b3 1405 }
a0ed51b3
LW
1406 }
1407 break;
a0d0e21e 1408 case NSPACE:
ffc61ed2 1409 if (do_utf8) {
8269fa76 1410 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 1411 while (s < strend) {
3568d838 1412 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
ffc61ed2
JH
1413 if (tmp && (norun || regtry(prog, s)))
1414 goto got_it;
1415 else
1416 tmp = doevery;
1417 }
a0d0e21e 1418 else
ffc61ed2
JH
1419 tmp = 1;
1420 s += UTF8SKIP(s);
a687059c 1421 }
a0d0e21e 1422 }
ffc61ed2
JH
1423 else {
1424 while (s < strend) {
1425 if (!isSPACE(*s)) {
1426 if (tmp && (norun || regtry(prog, s)))
1427 goto got_it;
1428 else
1429 tmp = doevery;
1430 }
a0ed51b3 1431 else
ffc61ed2
JH
1432 tmp = 1;
1433 s++;
a0ed51b3 1434 }
a0ed51b3
LW
1435 }
1436 break;
bbce6d69 1437 case NSPACEL:
3280af22 1438 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1439 if (do_utf8) {
1440 while (s < strend) {
1441 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1442 if (tmp && (norun || regtry(prog, s)))
1443 goto got_it;
1444 else
1445 tmp = doevery;
1446 }
bbce6d69 1447 else
ffc61ed2
JH
1448 tmp = 1;
1449 s += UTF8SKIP(s);
bbce6d69 1450 }
bbce6d69 1451 }
ffc61ed2
JH
1452 else {
1453 while (s < strend) {
1454 if (!isSPACE_LC(*s)) {
1455 if (tmp && (norun || regtry(prog, s)))
1456 goto got_it;
1457 else
1458 tmp = doevery;
1459 }
a0ed51b3 1460 else
ffc61ed2
JH
1461 tmp = 1;
1462 s++;
a0ed51b3 1463 }
a0ed51b3
LW
1464 }
1465 break;
a0d0e21e 1466 case DIGIT:
ffc61ed2 1467 if (do_utf8) {
8269fa76 1468 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1469 while (s < strend) {
3568d838 1470 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1471 if (tmp && (norun || regtry(prog, s)))
1472 goto got_it;
1473 else
1474 tmp = doevery;
1475 }
a0d0e21e 1476 else
ffc61ed2
JH
1477 tmp = 1;
1478 s += UTF8SKIP(s);
2b69d0c2 1479 }
a0d0e21e 1480 }
ffc61ed2
JH
1481 else {
1482 while (s < strend) {
1483 if (isDIGIT(*s)) {
1484 if (tmp && (norun || regtry(prog, s)))
1485 goto got_it;
1486 else
1487 tmp = doevery;
1488 }
a0ed51b3 1489 else
ffc61ed2
JH
1490 tmp = 1;
1491 s++;
a0ed51b3 1492 }
a0ed51b3
LW
1493 }
1494 break;
b8c5462f
JH
1495 case DIGITL:
1496 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1497 if (do_utf8) {
1498 while (s < strend) {
1499 if (isDIGIT_LC_utf8((U8*)s)) {
1500 if (tmp && (norun || regtry(prog, s)))
1501 goto got_it;
1502 else
1503 tmp = doevery;
1504 }
b8c5462f 1505 else
ffc61ed2
JH
1506 tmp = 1;
1507 s += UTF8SKIP(s);
b8c5462f 1508 }
b8c5462f 1509 }
ffc61ed2
JH
1510 else {
1511 while (s < strend) {
1512 if (isDIGIT_LC(*s)) {
1513 if (tmp && (norun || regtry(prog, s)))
1514 goto got_it;
1515 else
1516 tmp = doevery;
1517 }
b8c5462f 1518 else
ffc61ed2
JH
1519 tmp = 1;
1520 s++;
b8c5462f 1521 }
b8c5462f
JH
1522 }
1523 break;
a0d0e21e 1524 case NDIGIT:
ffc61ed2 1525 if (do_utf8) {
8269fa76 1526 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 1527 while (s < strend) {
3568d838 1528 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
ffc61ed2
JH
1529 if (tmp && (norun || regtry(prog, s)))
1530 goto got_it;
1531 else
1532 tmp = doevery;
1533 }
a0d0e21e 1534 else
ffc61ed2
JH
1535 tmp = 1;
1536 s += UTF8SKIP(s);
a687059c 1537 }
a0d0e21e 1538 }
ffc61ed2
JH
1539 else {
1540 while (s < strend) {
1541 if (!isDIGIT(*s)) {
1542 if (tmp && (norun || regtry(prog, s)))
1543 goto got_it;
1544 else
1545 tmp = doevery;
1546 }
a0ed51b3 1547 else
ffc61ed2
JH
1548 tmp = 1;
1549 s++;
a0ed51b3 1550 }
a0ed51b3
LW
1551 }
1552 break;
b8c5462f
JH
1553 case NDIGITL:
1554 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
1555 if (do_utf8) {
1556 while (s < strend) {
1557 if (!isDIGIT_LC_utf8((U8*)s)) {
1558 if (tmp && (norun || regtry(prog, s)))
1559 goto got_it;
1560 else
1561 tmp = doevery;
1562 }
b8c5462f 1563 else
ffc61ed2
JH
1564 tmp = 1;
1565 s += UTF8SKIP(s);
b8c5462f 1566 }
a0ed51b3 1567 }
ffc61ed2
JH
1568 else {
1569 while (s < strend) {
1570 if (!isDIGIT_LC(*s)) {
1571 if (tmp && (norun || regtry(prog, s)))
1572 goto got_it;
1573 else
1574 tmp = doevery;
1575 }
cf93c79d 1576 else
ffc61ed2
JH
1577 tmp = 1;
1578 s++;
b8c5462f 1579 }
b8c5462f
JH
1580 }
1581 break;
b3c9acc1 1582 default:
3c3eec57
GS
1583 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1584 break;
d6a28714 1585 }
6eb5f6b9
JH
1586 return 0;
1587 got_it:
1588 return s;
1589}
1590
1591/*
1592 - regexec_flags - match a regexp against a string
1593 */
1594I32
1595Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1596 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1597/* strend: pointer to null at end of string */
1598/* strbeg: real beginning of string */
1599/* minend: end of match must be >=minend after stringarg. */
1600/* data: May be used for some additional optimizations. */
1601/* nosave: For optimizations. */
1602{
6eb5f6b9
JH
1603 register char *s;
1604 register regnode *c;
1605 register char *startpos = stringarg;
6eb5f6b9
JH
1606 I32 minlen; /* must match at least this many chars */
1607 I32 dontbother = 0; /* how many characters not to try at end */
155aba94 1608 /* I32 start_shift = 0; */ /* Offset of the start to find
6eb5f6b9
JH
1609 constant substr. */ /* CC */
1610 I32 end_shift = 0; /* Same for the end. */ /* CC */
1611 I32 scream_pos = -1; /* Internal iterator of scream. */
1612 char *scream_olds;
1613 SV* oreplsv = GvSV(PL_replgv);
1aa99e6b 1614 bool do_utf8 = DO_UTF8(sv);
2a782b5b 1615#ifdef DEBUGGING
9e55ce06
JH
1616 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1617 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2a782b5b 1618#endif
6eb5f6b9
JH
1619
1620 PL_regcc = 0;
1621
1622 cache_re(prog);
1623#ifdef DEBUGGING
aea4f609 1624 PL_regnarrate = DEBUG_r_TEST;
6eb5f6b9
JH
1625#endif
1626
1627 /* Be paranoid... */
1628 if (prog == NULL || startpos == NULL) {
1629 Perl_croak(aTHX_ "NULL regexp parameter");
1630 return 0;
1631 }
1632
1633 minlen = prog->minlen;
61a36c01 1634 if (strend - startpos < minlen) {
a72c7584
JH
1635 DEBUG_r(PerlIO_printf(Perl_debug_log,
1636 "String too short [regexec_flags]...\n"));
1637 goto phooey;
1aa99e6b 1638 }
6eb5f6b9 1639
6eb5f6b9
JH
1640 /* Check validity of program. */
1641 if (UCHARAT(prog->program) != REG_MAGIC) {
1642 Perl_croak(aTHX_ "corrupted regexp program");
1643 }
1644
1645 PL_reg_flags = 0;
1646 PL_reg_eval_set = 0;
1647 PL_reg_maxiter = 0;
1648
1649 if (prog->reganch & ROPT_UTF8)
1650 PL_reg_flags |= RF_utf8;
1651
1652 /* Mark beginning of line for ^ and lookbehind. */
1653 PL_regbol = startpos;
1654 PL_bostr = strbeg;
1655 PL_reg_sv = sv;
1656
1657 /* Mark end of line for $ (and such) */
1658 PL_regeol = strend;
1659
1660 /* see how far we have to get to not match where we matched before */
1661 PL_regtill = startpos+minend;
1662
1663 /* We start without call_cc context. */
1664 PL_reg_call_cc = 0;
1665
1666 /* If there is a "must appear" string, look for it. */
1667 s = startpos;
1668
1669 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1670 MAGIC *mg;
1671
1672 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1673 PL_reg_ganch = startpos;
1674 else if (sv && SvTYPE(sv) >= SVt_PVMG
1675 && SvMAGIC(sv)
14befaf4
DM
1676 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1677 && mg->mg_len >= 0) {
6eb5f6b9
JH
1678 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1679 if (prog->reganch & ROPT_ANCH_GPOS) {
f33976b4 1680 if (s > PL_reg_ganch)
6eb5f6b9
JH
1681 goto phooey;
1682 s = PL_reg_ganch;
1683 }
1684 }
1685 else /* pos() not defined */
1686 PL_reg_ganch = strbeg;
1687 }
1688
33b8afdf 1689 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
6eb5f6b9
JH
1690 re_scream_pos_data d;
1691
1692 d.scream_olds = &scream_olds;
1693 d.scream_pos = &scream_pos;
1694 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7
JH
1695 if (!s) {
1696 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1697 goto phooey; /* not present */
3fa9c3d7 1698 }
6eb5f6b9
JH
1699 }
1700
2a782b5b 1701 DEBUG_r({
9e55ce06
JH
1702 char *s0 = UTF ?
1703 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
c728cb41 1704 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1705 prog->precomp;
1706 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1707 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
c728cb41 1708 UNI_DISPLAY_REGEX) : startpos;
9e55ce06 1709 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
2a782b5b
JH
1710 if (!PL_colorset)
1711 reginitcolors();
1712 PerlIO_printf(Perl_debug_log,
9e55ce06 1713 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
2a782b5b 1714 PL_colors[4],PL_colors[5],PL_colors[0],
9e55ce06 1715 len0, len0, s0,
2a782b5b 1716 PL_colors[1],
9e55ce06 1717 len0 > 60 ? "..." : "",
2a782b5b 1718 PL_colors[0],
9e55ce06
JH
1719 (int)(len1 > 60 ? 60 : len1),
1720 s1, PL_colors[1],
1721 (len1 > 60 ? "..." : "")
2a782b5b
JH
1722 );
1723 });
6eb5f6b9
JH
1724
1725 /* Simplest case: anchored match need be tried only once. */
1726 /* [unless only anchor is BOL and multiline is set] */
1727 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1728 if (s == startpos && regtry(prog, startpos))
1729 goto got_it;
1730 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1731 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1732 {
1733 char *end;
1734
1735 if (minlen)
1736 dontbother = minlen - 1;
1aa99e6b 1737 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1738 /* for multiline we only have to try after newlines */
33b8afdf 1739 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1740 if (s == startpos)
1741 goto after_try;
1742 while (1) {
1743 if (regtry(prog, s))
1744 goto got_it;
1745 after_try:
1746 if (s >= end)
1747 goto phooey;
1748 if (prog->reganch & RE_USE_INTUIT) {
1749 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1750 if (!s)
1751 goto phooey;
1752 }
1753 else
1754 s++;
1755 }
1756 } else {
1757 if (s > startpos)
1758 s--;
1759 while (s < end) {
1760 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1761 if (regtry(prog, s))
1762 goto got_it;
1763 }
1764 }
1765 }
1766 }
1767 goto phooey;
1768 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1769 if (regtry(prog, PL_reg_ganch))
1770 goto got_it;
1771 goto phooey;
1772 }
1773
1774 /* Messy cases: unanchored match. */
33b8afdf 1775 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1776 /* we have /x+whatever/ */
1777 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1778 char ch;
bf93d4cc
GS
1779#ifdef DEBUGGING
1780 int did_match = 0;
1781#endif
33b8afdf
JH
1782 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1783 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1784 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1785
1aa99e6b 1786 if (do_utf8) {
6eb5f6b9
JH
1787 while (s < strend) {
1788 if (*s == ch) {
bf93d4cc 1789 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1790 if (regtry(prog, s)) goto got_it;
1791 s += UTF8SKIP(s);
1792 while (s < strend && *s == ch)
1793 s += UTF8SKIP(s);
1794 }
1795 s += UTF8SKIP(s);
1796 }
1797 }
1798 else {
1799 while (s < strend) {
1800 if (*s == ch) {
bf93d4cc 1801 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1802 if (regtry(prog, s)) goto got_it;
1803 s++;
1804 while (s < strend && *s == ch)
1805 s++;
1806 }
1807 s++;
1808 }
1809 }
b7953727 1810 DEBUG_r(if (!did_match)
bf93d4cc 1811 PerlIO_printf(Perl_debug_log,
b7953727
JH
1812 "Did not find anchored character...\n")
1813 );
6eb5f6b9
JH
1814 }
1815 /*SUPPRESS 560*/
33b8afdf
JH
1816 else if (prog->anchored_substr != Nullsv
1817 || prog->anchored_utf8 != Nullsv
1818 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1819 && prog->float_max_offset < strend - s)) {
1820 SV *must;
1821 I32 back_max;
1822 I32 back_min;
1823 char *last;
6eb5f6b9 1824 char *last1; /* Last position checked before */
bf93d4cc
GS
1825#ifdef DEBUGGING
1826 int did_match = 0;
1827#endif
33b8afdf
JH
1828 if (prog->anchored_substr || prog->anchored_utf8) {
1829 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1830 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1831 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1832 back_max = back_min = prog->anchored_offset;
1833 } else {
1834 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1835 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1836 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1837 back_max = prog->float_max_offset;
1838 back_min = prog->float_min_offset;
1839 }
1840 if (must == &PL_sv_undef)
1841 /* could not downgrade utf8 check substring, so must fail */
1842 goto phooey;
1843
1844 last = HOP3c(strend, /* Cannot start after this */
1845 -(I32)(CHR_SVLEN(must)
1846 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1847
1848 if (s > PL_bostr)
1849 last1 = HOPc(s, -1);
1850 else
1851 last1 = s - 1; /* bogus */
1852
1853 /* XXXX check_substr already used to find `s', can optimize if
1854 check_substr==must. */
1855 scream_pos = -1;
1856 dontbother = end_shift;
1857 strend = HOPc(strend, -dontbother);
1858 while ( (s <= last) &&
9041c2e3 1859 ((flags & REXEC_SCREAM)
1aa99e6b 1860 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1861 end_shift, &scream_pos, 0))
1aa99e6b 1862 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1863 (unsigned char*)strend, must,
6eb5f6b9 1864 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1865 /* we may be pointing at the wrong string */
1866 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1867 s = prog->subbeg + (s - SvPVX(sv));
bf93d4cc 1868 DEBUG_r( did_match = 1 );
6eb5f6b9
JH
1869 if (HOPc(s, -back_max) > last1) {
1870 last1 = HOPc(s, -back_min);
1871 s = HOPc(s, -back_max);
1872 }
1873 else {
1874 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1875
1876 last1 = HOPc(s, -back_min);
1877 s = t;
1878 }
1aa99e6b 1879 if (do_utf8) {
6eb5f6b9
JH
1880 while (s <= last1) {
1881 if (regtry(prog, s))
1882 goto got_it;
1883 s += UTF8SKIP(s);
1884 }
1885 }
1886 else {
1887 while (s <= last1) {
1888 if (regtry(prog, s))
1889 goto got_it;
1890 s++;
1891 }
1892 }
1893 }
b7953727
JH
1894 DEBUG_r(if (!did_match)
1895 PerlIO_printf(Perl_debug_log,
1896 "Did not find %s substr `%s%.*s%s'%s...\n",
33b8afdf 1897 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc
GS
1898 ? "anchored" : "floating"),
1899 PL_colors[0],
1900 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1901 SvPVX(must),
b7953727
JH
1902 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1903 );
6eb5f6b9
JH
1904 goto phooey;
1905 }
155aba94 1906 else if ((c = prog->regstclass)) {
f14c76ed
RGS
1907 if (minlen) {
1908 I32 op = (U8)OP(prog->regstclass);
66e933ab 1909 /* don't bother with what can't match */
f14c76ed
RGS
1910 if (PL_regkind[op] != EXACT && op != CANY)
1911 strend = HOPc(strend, -(minlen - 1));
1912 }
ffc61ed2
JH
1913 DEBUG_r({
1914 SV *prop = sv_newmortal();
9e55ce06
JH
1915 char *s0;
1916 char *s1;
1917 int len0;
1918 int len1;
1919
ffc61ed2 1920 regprop(prop, c);
9e55ce06
JH
1921 s0 = UTF ?
1922 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
c728cb41 1923 UNI_DISPLAY_REGEX) :
9e55ce06
JH
1924 SvPVX(prop);
1925 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1926 s1 = UTF ?
c728cb41 1927 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
9e55ce06
JH
1928 len1 = UTF ? SvCUR(dsv1) : strend - s;
1929 PerlIO_printf(Perl_debug_log,
1930 "Matching stclass `%*.*s' against `%*.*s'\n",
1931 len0, len0, s0,
1932 len1, len1, s1);
ffc61ed2 1933 });
6eb5f6b9
JH
1934 if (find_byclass(prog, c, s, strend, startpos, 0))
1935 goto got_it;
bf93d4cc 1936 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
d6a28714
JH
1937 }
1938 else {
1939 dontbother = 0;
33b8afdf
JH
1940 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1941 /* Trim the end. */
d6a28714 1942 char *last;
33b8afdf
JH
1943 SV* float_real;
1944
1945 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1946 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1947 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1948
1949 if (flags & REXEC_SCREAM) {
33b8afdf 1950 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1951 end_shift, &scream_pos, 1); /* last one */
1952 if (!last)
ffc61ed2 1953 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1954 /* we may be pointing at the wrong string */
1955 else if (RX_MATCH_COPIED(prog))
1956 s = prog->subbeg + (s - SvPVX(sv));
b8c5462f 1957 }
d6a28714
JH
1958 else {
1959 STRLEN len;
33b8afdf 1960 char *little = SvPV(float_real, len);
d6a28714 1961
33b8afdf 1962 if (SvTAIL(float_real)) {
d6a28714
JH
1963 if (memEQ(strend - len + 1, little, len - 1))
1964 last = strend - len + 1;
1965 else if (!PL_multiline)
9041c2e3 1966 last = memEQ(strend - len, little, len)
d6a28714 1967 ? strend - len : Nullch;
b8c5462f 1968 else
d6a28714
JH
1969 goto find_last;
1970 } else {
1971 find_last:
9041c2e3 1972 if (len)
d6a28714 1973 last = rninstr(s, strend, little, little + len);
b8c5462f 1974 else
d6a28714 1975 last = strend; /* matching `$' */
b8c5462f 1976 }
b8c5462f 1977 }
bf93d4cc
GS
1978 if (last == NULL) {
1979 DEBUG_r(PerlIO_printf(Perl_debug_log,
1980 "%sCan't trim the tail, match fails (should not happen)%s\n",
1981 PL_colors[4],PL_colors[5]));
1982 goto phooey; /* Should not happen! */
1983 }
d6a28714
JH
1984 dontbother = strend - last + prog->float_min_offset;
1985 }
1986 if (minlen && (dontbother < minlen))
1987 dontbother = minlen - 1;
1988 strend -= dontbother; /* this one's always in bytes! */
1989 /* We don't know much -- general case. */
1aa99e6b 1990 if (do_utf8) {
d6a28714
JH
1991 for (;;) {
1992 if (regtry(prog, s))
1993 goto got_it;
1994 if (s >= strend)
1995 break;
b8c5462f 1996 s += UTF8SKIP(s);
d6a28714
JH
1997 };
1998 }
1999 else {
2000 do {
2001 if (regtry(prog, s))
2002 goto got_it;
2003 } while (s++ < strend);
2004 }
2005 }
2006
2007 /* Failure. */
2008 goto phooey;
2009
2010got_it:
2011 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2012
2013 if (PL_reg_eval_set) {
2014 /* Preserve the current value of $^R */
2015 if (oreplsv != GvSV(PL_replgv))
2016 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2017 restored, the value remains
2018 the same. */
acfe0abc 2019 restore_pos(aTHX_ 0);
d6a28714
JH
2020 }
2021
2022 /* make sure $`, $&, $', and $digit will work later */
2023 if ( !(flags & REXEC_NOT_FIRST) ) {
2024 if (RX_MATCH_COPIED(prog)) {
2025 Safefree(prog->subbeg);
2026 RX_MATCH_COPIED_off(prog);
2027 }
2028 if (flags & REXEC_COPY_STR) {
2029 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2030
2031 s = savepvn(strbeg, i);
2032 prog->subbeg = s;
2033 prog->sublen = i;
2034 RX_MATCH_COPIED_on(prog);
2035 }
2036 else {
2037 prog->subbeg = strbeg;
2038 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2039 }
2040 }
9041c2e3 2041
d6a28714
JH
2042 return 1;
2043
2044phooey:
bf93d4cc
GS
2045 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2046 PL_colors[4],PL_colors[5]));
d6a28714 2047 if (PL_reg_eval_set)
acfe0abc 2048 restore_pos(aTHX_ 0);
d6a28714
JH
2049 return 0;
2050}
2051
2052/*
2053 - regtry - try match at specific point
2054 */
2055STATIC I32 /* 0 failure, 1 success */
2056S_regtry(pTHX_ regexp *prog, char *startpos)
2057{
d6a28714
JH
2058 register I32 i;
2059 register I32 *sp;
2060 register I32 *ep;
2061 CHECKPOINT lastcp;
2062
02db2b7b
IZ
2063#ifdef DEBUGGING
2064 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2065#endif
d6a28714
JH
2066 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2067 MAGIC *mg;
2068
2069 PL_reg_eval_set = RS_init;
2070 DEBUG_r(DEBUG_s(
b900a521
JH
2071 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2072 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 2073 ));
e8347627 2074 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
2075 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2076 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2077 SAVETMPS;
2078 /* Apparently this is not needed, judging by wantarray. */
e8347627 2079 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
2080 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2081
2082 if (PL_reg_sv) {
2083 /* Make $_ available to executed code. */
2084 if (PL_reg_sv != DEFSV) {
4d1ff10f 2085 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
d6a28714
JH
2086 SAVESPTR(DEFSV);
2087 DEFSV = PL_reg_sv;
b8c5462f 2088 }
d6a28714 2089
9041c2e3 2090 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
14befaf4 2091 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
d6a28714 2092 /* prepare for quick setting of pos */
14befaf4
DM
2093 sv_magic(PL_reg_sv, (SV*)0,
2094 PERL_MAGIC_regex_global, Nullch, 0);
2095 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
d6a28714 2096 mg->mg_len = -1;
b8c5462f 2097 }
d6a28714
JH
2098 PL_reg_magic = mg;
2099 PL_reg_oldpos = mg->mg_len;
c76ac1ee 2100 SAVEDESTRUCTOR_X(restore_pos, 0);
d6a28714 2101 }
09687e5a 2102 if (!PL_reg_curpm) {
0f79a09d 2103 Newz(22,PL_reg_curpm, 1, PMOP);
09687e5a
AB
2104#ifdef USE_ITHREADS
2105 {
2106 SV* repointer = newSViv(0);
577e12cc 2107 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 2108 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
2109 av_push(PL_regex_padav,repointer);
2110 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2111 PL_regex_pad = AvARRAY(PL_regex_padav);
2112 }
2113#endif
2114 }
aaa362c4 2115 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
2116 PL_reg_oldcurpm = PL_curpm;
2117 PL_curpm = PL_reg_curpm;
2118 if (RX_MATCH_COPIED(prog)) {
2119 /* Here is a serious problem: we cannot rewrite subbeg,
2120 since it may be needed if this match fails. Thus
2121 $` inside (?{}) could fail... */
2122 PL_reg_oldsaved = prog->subbeg;
2123 PL_reg_oldsavedlen = prog->sublen;
2124 RX_MATCH_COPIED_off(prog);
2125 }
2126 else
2127 PL_reg_oldsaved = Nullch;
2128 prog->subbeg = PL_bostr;
2129 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2130 }
2131 prog->startp[0] = startpos - PL_bostr;
2132 PL_reginput = startpos;
2133 PL_regstartp = prog->startp;
2134 PL_regendp = prog->endp;
2135 PL_reglastparen = &prog->lastparen;
a01268b5 2136 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2137 prog->lastparen = 0;
03994de8 2138 prog->lastcloseparen = 0;
d6a28714
JH
2139 PL_regsize = 0;
2140 DEBUG_r(PL_reg_starttry = startpos);
2141 if (PL_reg_start_tmpl <= prog->nparens) {
2142 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2143 if(PL_reg_start_tmp)
2144 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2145 else
2146 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2147 }
2148
2149 /* XXXX What this code is doing here?!!! There should be no need
2150 to do this again and again, PL_reglastparen should take care of
3dd2943c 2151 this! --ilya*/
dafc8851
JH
2152
2153 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2154 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2155 * PL_reglastparen), is not needed at all by the test suite
2156 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2157 * enough, for building DynaLoader, or otherwise this
2158 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2159 * will happen. Meanwhile, this code *is* needed for the
2160 * above-mentioned test suite tests to succeed. The common theme
2161 * on those tests seems to be returning null fields from matches.
2162 * --jhi */
dafc8851 2163#if 1
d6a28714
JH
2164 sp = prog->startp;
2165 ep = prog->endp;
2166 if (prog->nparens) {
eb160463 2167 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2168 *++sp = -1;
2169 *++ep = -1;
2170 }
2171 }
dafc8851 2172#endif
02db2b7b 2173 REGCP_SET(lastcp);
d6a28714
JH
2174 if (regmatch(prog->program + 1)) {
2175 prog->endp[0] = PL_reginput - PL_bostr;
2176 return 1;
2177 }
02db2b7b 2178 REGCP_UNWIND(lastcp);
d6a28714
JH
2179 return 0;
2180}
2181
02db2b7b
IZ
2182#define RE_UNWIND_BRANCH 1
2183#define RE_UNWIND_BRANCHJ 2
2184
2185union re_unwind_t;
2186
2187typedef struct { /* XX: makes sense to enlarge it... */
2188 I32 type;
2189 I32 prev;
2190 CHECKPOINT lastcp;
2191} re_unwind_generic_t;
2192
2193typedef struct {
2194 I32 type;
2195 I32 prev;
2196 CHECKPOINT lastcp;
2197 I32 lastparen;
2198 regnode *next;
2199 char *locinput;
2200 I32 nextchr;
2201#ifdef DEBUGGING
2202 int regindent;
2203#endif
2204} re_unwind_branch_t;
2205
2206typedef union re_unwind_t {
2207 I32 type;
2208 re_unwind_generic_t generic;
2209 re_unwind_branch_t branch;
2210} re_unwind_t;
2211
8ba1375e
MJD
2212#define sayYES goto yes
2213#define sayNO goto no
e0f9d4a8 2214#define sayNO_ANYOF goto no_anyof
8ba1375e
MJD
2215#define sayYES_FINAL goto yes_final
2216#define sayYES_LOUD goto yes_loud
2217#define sayNO_FINAL goto no_final
2218#define sayNO_SILENT goto do_no
2219#define saySAME(x) if (x) goto yes; else goto no
2220
2221#define REPORT_CODE_OFF 24
2222
d6a28714
JH
2223/*
2224 - regmatch - main matching routine
2225 *
2226 * Conceptually the strategy is simple: check to see whether the current
2227 * node matches, call self recursively to see whether the rest matches,
2228 * and then act accordingly. In practice we make some effort to avoid
2229 * recursion, in particular by going through "ordinary" nodes (that don't
2230 * need to know whether the rest of the match failed) by a loop instead of
2231 * by recursion.
2232 */
2233/* [lwall] I've hoisted the register declarations to the outer block in order to
2234 * maybe save a little bit of pushing and popping on the stack. It also takes
2235 * advantage of machines that use a register save mask on subroutine entry.
2236 */
2237STATIC I32 /* 0 failure, 1 success */
2238S_regmatch(pTHX_ regnode *prog)
2239{
d6a28714
JH
2240 register regnode *scan; /* Current node. */
2241 regnode *next; /* Next node. */
2242 regnode *inner; /* Next node in internal branch. */
2243 register I32 nextchr; /* renamed nextchr - nextchar colides with
2244 function of same name */
2245 register I32 n; /* no or next */
b7953727
JH
2246 register I32 ln = 0; /* len or last */
2247 register char *s = Nullch; /* operand or save */
d6a28714 2248 register char *locinput = PL_reginput;
b7953727 2249 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
d6a28714 2250 int minmod = 0, sw = 0, logical = 0;
02db2b7b 2251 I32 unwind = 0;
b7953727 2252#if 0
02db2b7b 2253 I32 firstcp = PL_savestack_ix;
b7953727 2254#endif
53c4c00c 2255 register bool do_utf8 = PL_reg_match_utf8;
2a782b5b 2256#ifdef DEBUGGING
ce333219
JH
2257 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2258 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2259 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2a782b5b 2260#endif
02db2b7b 2261
d6a28714
JH
2262#ifdef DEBUGGING
2263 PL_regindent++;
2264#endif
2265
2266 /* Note that nextchr is a byte even in UTF */
2267 nextchr = UCHARAT(locinput);
2268 scan = prog;
2269 while (scan != NULL) {
8ba1375e 2270
2a782b5b 2271 DEBUG_r( {
d6a28714
JH
2272 SV *prop = sv_newmortal();
2273 int docolor = *PL_colors[0];
2274 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
1aa99e6b 2275 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
d6a28714
JH
2276 /* The part of the string before starttry has one color
2277 (pref0_len chars), between starttry and current
2278 position another one (pref_len - pref0_len chars),
2279 after the current position the third one.
2280 We assume that pref0_len <= pref_len, otherwise we
2281 decrease pref0_len. */
9041c2e3 2282 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
1aa99e6b
IH
2283 ? (5 + taill) - l : locinput - PL_bostr;
2284 int pref0_len;
d6a28714 2285
df1ffd02 2286 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
1aa99e6b
IH
2287 pref_len++;
2288 pref0_len = pref_len - (locinput - PL_reg_starttry);
d6a28714 2289 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
9041c2e3 2290 l = ( PL_regeol - locinput > (5 + taill) - pref_len
d6a28714 2291 ? (5 + taill) - pref_len : PL_regeol - locinput);
df1ffd02 2292 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
1aa99e6b 2293 l--;
d6a28714
JH
2294 if (pref0_len < 0)
2295 pref0_len = 0;
2296 if (pref0_len > pref_len)
2297 pref0_len = pref_len;
2298 regprop(prop, scan);
2a782b5b
JH
2299 {
2300 char *s0 =
f14c76ed 2301 do_utf8 && OP(scan) != CANY ?
2a782b5b 2302 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
c728cb41 2303 pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2304 locinput - pref_len;
df1ffd02 2305 int len0 = do_utf8 ? strlen(s0) : pref0_len;
f14c76ed 2306 char *s1 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2307 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
c728cb41 2308 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2309 locinput - pref_len + pref0_len;
df1ffd02 2310 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
f14c76ed 2311 char *s2 = do_utf8 && OP(scan) != CANY ?
2a782b5b 2312 pv_uni_display(dsv2, (U8*)locinput,
c728cb41 2313 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2a782b5b 2314 locinput;
df1ffd02 2315 int len2 = do_utf8 ? strlen(s2) : l;
2a782b5b
JH
2316 PerlIO_printf(Perl_debug_log,
2317 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2318 (IV)(locinput - PL_bostr),
2319 PL_colors[4],
2320 len0, s0,
2321 PL_colors[5],
2322 PL_colors[2],
2323 len1, s1,
2324 PL_colors[3],
2325 (docolor ? "" : "> <"),
2326 PL_colors[0],
2327 len2, s2,
2328 PL_colors[1],
2329 15 - l - pref_len + 1,
2330 "",
2331 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2332 SvPVX(prop));
2333 }
2334 });
d6a28714
JH
2335
2336 next = scan + NEXT_OFF(scan);
2337 if (next == scan)
2338 next = NULL;
2339
2340 switch (OP(scan)) {
2341 case BOL:
12d33761
HS
2342 if (locinput == PL_bostr || (PL_multiline &&
2343 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
d6a28714
JH
2344 {
2345 /* regtill = regbol; */
b8c5462f
JH
2346 break;
2347 }
d6a28714
JH
2348 sayNO;
2349 case MBOL:
12d33761
HS
2350 if (locinput == PL_bostr ||
2351 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2352 {
b8c5462f
JH
2353 break;
2354 }
d6a28714
JH
2355 sayNO;
2356 case SBOL:
c2a73568 2357 if (locinput == PL_bostr)
b8c5462f 2358 break;
d6a28714
JH
2359 sayNO;
2360 case GPOS:
2361 if (locinput == PL_reg_ganch)
2362 break;
2363 sayNO;
2364 case EOL:
2365 if (PL_multiline)
2366 goto meol;
2367 else
2368 goto seol;
2369 case MEOL:
2370 meol:
2371 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2372 sayNO;
b8c5462f 2373 break;
d6a28714
JH
2374 case SEOL:
2375 seol:
2376 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2377 sayNO;
d6a28714 2378 if (PL_regeol - locinput > 1)
b8c5462f 2379 sayNO;
b8c5462f 2380 break;
d6a28714
JH
2381 case EOS:
2382 if (PL_regeol != locinput)
b8c5462f 2383 sayNO;
d6a28714 2384 break;
ffc61ed2 2385 case SANY:
d6a28714 2386 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2387 sayNO;
f33976b4
DB
2388 if (do_utf8) {
2389 locinput += PL_utf8skip[nextchr];
2390 if (locinput > PL_regeol)
2391 sayNO;
2392 nextchr = UCHARAT(locinput);
2393 }
2394 else
2395 nextchr = UCHARAT(++locinput);
2396 break;
2397 case CANY:
2398 if (!nextchr && locinput >= PL_regeol)
2399 sayNO;
b8c5462f 2400 nextchr = UCHARAT(++locinput);
a0d0e21e 2401 break;
ffc61ed2 2402 case REG_ANY:
1aa99e6b
IH
2403 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2404 sayNO;
2405 if (do_utf8) {
b8c5462f 2406 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2407 if (locinput > PL_regeol)
2408 sayNO;
a0ed51b3 2409 nextchr = UCHARAT(locinput);
a0ed51b3 2410 }
1aa99e6b
IH
2411 else
2412 nextchr = UCHARAT(++locinput);
a0ed51b3 2413 break;
d6a28714 2414 case EXACT:
cd439c50
IZ
2415 s = STRING(scan);
2416 ln = STR_LEN(scan);
eb160463 2417 if (do_utf8 != UTF) {
bc517b45 2418 /* The target and the pattern have differing utf8ness. */
1aa99e6b
IH
2419 char *l = locinput;
2420 char *e = s + ln;
bc517b45 2421 STRLEN ulen;
a72c7584 2422
5ff6fc6d
JH
2423 if (do_utf8) {
2424 /* The target is utf8, the pattern is not utf8. */
1aa99e6b
IH
2425 while (s < e) {
2426 if (l >= PL_regeol)
5ff6fc6d
JH
2427 sayNO;
2428 if (NATIVE_TO_UNI(*(U8*)s) !=
872c91ae
JH
2429 utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
2430 ckWARN(WARN_UTF8) ?
2431 0 : UTF8_ALLOW_ANY))
5ff6fc6d 2432 sayNO;
bc517b45 2433 l += ulen;
5ff6fc6d 2434 s ++;
1aa99e6b 2435 }
5ff6fc6d
JH
2436 }
2437 else {
2438 /* The target is not utf8, the pattern is utf8. */
1aa99e6b
IH
2439 while (s < e) {
2440 if (l >= PL_regeol)
2441 sayNO;
5ff6fc6d 2442 if (NATIVE_TO_UNI(*((U8*)l)) !=
872c91ae
JH
2443 utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
2444 ckWARN(WARN_UTF8) ?
2445 0 : UTF8_ALLOW_ANY))
1aa99e6b 2446 sayNO;
bc517b45 2447 s += ulen;
a72c7584 2448 l ++;
1aa99e6b 2449 }
5ff6fc6d 2450 }
1aa99e6b
IH
2451 locinput = l;
2452 nextchr = UCHARAT(locinput);
2453 break;
2454 }
bc517b45 2455 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2456 /* Inline the first character, for speed. */
2457 if (UCHARAT(s) != nextchr)
2458 sayNO;
2459 if (PL_regeol - locinput < ln)
2460 sayNO;
2461 if (ln > 1 && memNE(s, locinput, ln))
2462 sayNO;
2463 locinput += ln;
2464 nextchr = UCHARAT(locinput);
2465 break;
2466 case EXACTFL:
b8c5462f
JH
2467 PL_reg_flags |= RF_tainted;
2468 /* FALL THROUGH */
d6a28714 2469 case EXACTF:
cd439c50
IZ
2470 s = STRING(scan);
2471 ln = STR_LEN(scan);
d6a28714 2472
d07ddd77
JH
2473 if (do_utf8 || UTF) {
2474 /* Either target or the pattern are utf8. */
d6a28714 2475 char *l = locinput;
d07ddd77 2476 char *e = PL_regeol;
bc517b45 2477
eb160463 2478 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
1feea2c7 2479 l, &e, 0, do_utf8)) {
5486206c
JH
2480 /* One more case for the sharp s:
2481 * pack("U0U*", 0xDF) =~ /ss/i,
2482 * the 0xC3 0x9F are the UTF-8
2483 * byte sequence for the U+00DF. */
2484 if (!(do_utf8 &&
2485 toLOWER(s[0]) == 's' &&
2486 ln >= 2 &&
2487 toLOWER(s[1]) == 's' &&
2488 (U8)l[0] == 0xC3 &&
2489 e - l >= 2 &&
2490 (U8)l[1] == 0x9F))
2491 sayNO;
2492 }
d07ddd77
JH
2493 locinput = e;
2494 nextchr = UCHARAT(locinput);
2495 break;
a0ed51b3 2496 }
d6a28714 2497
bc517b45
JH
2498 /* Neither the target and the pattern are utf8. */
2499
d6a28714
JH
2500 /* Inline the first character, for speed. */
2501 if (UCHARAT(s) != nextchr &&
2502 UCHARAT(s) != ((OP(scan) == EXACTF)
2503 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2504 sayNO;
d6a28714 2505 if (PL_regeol - locinput < ln)
b8c5462f 2506 sayNO;
d6a28714
JH
2507 if (ln > 1 && (OP(scan) == EXACTF
2508 ? ibcmp(s, locinput, ln)
2509 : ibcmp_locale(s, locinput, ln)))
4633a7c4 2510 sayNO;
d6a28714
JH
2511 locinput += ln;
2512 nextchr = UCHARAT(locinput);
a0d0e21e 2513 break;
d6a28714 2514 case ANYOF:
ffc61ed2 2515 if (do_utf8) {
9e55ce06
JH
2516 STRLEN inclasslen = PL_regeol - locinput;
2517
ba7b4546 2518 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2519 sayNO_ANYOF;
ffc61ed2
JH
2520 if (locinput >= PL_regeol)
2521 sayNO;
0f0076b4 2522 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2523 nextchr = UCHARAT(locinput);
e0f9d4a8 2524 break;
ffc61ed2
JH
2525 }
2526 else {
2527 if (nextchr < 0)
2528 nextchr = UCHARAT(locinput);
7d3e948e 2529 if (!REGINCLASS(scan, (U8*)locinput))
e0f9d4a8 2530 sayNO_ANYOF;
ffc61ed2
JH
2531 if (!nextchr && locinput >= PL_regeol)
2532 sayNO;
2533 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2534 break;
2535 }
2536 no_anyof:
2537 /* If we might have the case of the German sharp s
2538 * in a casefolding Unicode character class. */
2539
ebc501f0
JH
2540 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2541 locinput += SHARP_S_SKIP;
e0f9d4a8 2542 nextchr = UCHARAT(locinput);
ffc61ed2 2543 }
e0f9d4a8
JH
2544 else
2545 sayNO;
b8c5462f 2546 break;
d6a28714 2547 case ALNUML:
b8c5462f
JH
2548 PL_reg_flags |= RF_tainted;
2549 /* FALL THROUGH */
d6a28714 2550 case ALNUM:
b8c5462f 2551 if (!nextchr)
4633a7c4 2552 sayNO;
ffc61ed2 2553 if (do_utf8) {
ad24be35 2554 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2555 if (!(OP(scan) == ALNUM
3568d838 2556 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2557 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2558 {
2559 sayNO;
a0ed51b3 2560 }
b8c5462f 2561 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2562 nextchr = UCHARAT(locinput);
2563 break;
2564 }
ffc61ed2 2565 if (!(OP(scan) == ALNUM
d6a28714 2566 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2567 sayNO;
b8c5462f 2568 nextchr = UCHARAT(++locinput);
a0d0e21e 2569 break;
d6a28714 2570 case NALNUML:
b8c5462f
JH
2571 PL_reg_flags |= RF_tainted;
2572 /* FALL THROUGH */
d6a28714
JH
2573 case NALNUM:
2574 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2575 sayNO;
ffc61ed2 2576 if (do_utf8) {
8269fa76 2577 LOAD_UTF8_CHARCLASS(alnum,"a");
ffc61ed2 2578 if (OP(scan) == NALNUM
3568d838 2579 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2580 : isALNUM_LC_utf8((U8*)locinput))
2581 {
b8c5462f 2582 sayNO;
d6a28714 2583 }
b8c5462f
JH
2584 locinput += PL_utf8skip[nextchr];
2585 nextchr = UCHARAT(locinput);
2586 break;
2587 }
ffc61ed2 2588 if (OP(scan) == NALNUM
d6a28714 2589 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2590 sayNO;
76e3520e 2591 nextchr = UCHARAT(++locinput);
a0d0e21e 2592 break;
d6a28714
JH
2593 case BOUNDL:
2594 case NBOUNDL:
3280af22 2595 PL_reg_flags |= RF_tainted;
bbce6d69 2596 /* FALL THROUGH */
d6a28714
JH
2597 case BOUND:
2598 case NBOUND:
2599 /* was last char in word? */
ffc61ed2 2600 if (do_utf8) {
12d33761
HS
2601 if (locinput == PL_bostr)
2602 ln = '\n';
ffc61ed2 2603 else {
b4f7163a 2604 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2605
b4f7163a 2606 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
ffc61ed2
JH
2607 }
2608 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2609 ln = isALNUM_uni(ln);
8269fa76 2610 LOAD_UTF8_CHARCLASS(alnum,"a");
3568d838 2611 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2612 }
2613 else {
9041c2e3 2614 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
ffc61ed2
JH
2615 n = isALNUM_LC_utf8((U8*)locinput);
2616 }
a0ed51b3 2617 }
d6a28714 2618 else {
12d33761
HS
2619 ln = (locinput != PL_bostr) ?
2620 UCHARAT(locinput - 1) : '\n';
ffc61ed2
JH
2621 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2622 ln = isALNUM(ln);
2623 n = isALNUM(nextchr);
2624 }
2625 else {
2626 ln = isALNUM_LC(ln);
2627 n = isALNUM_LC(nextchr);
2628 }
d6a28714 2629 }
ffc61ed2
JH
2630 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2631 OP(scan) == BOUNDL))
2632 sayNO;
a0ed51b3 2633 break;
d6a28714 2634 case SPACEL:
3280af22 2635 PL_reg_flags |= RF_tainted;
bbce6d69 2636 /* FALL THROUGH */
d6a28714 2637 case SPACE:
9442cb0e 2638 if (!nextchr)
4633a7c4 2639 sayNO;
1aa99e6b 2640 if (do_utf8) {
fd400ab9 2641 if (UTF8_IS_CONTINUED(nextchr)) {
8269fa76 2642 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2643 if (!(OP(scan) == SPACE
3568d838 2644 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2645 : isSPACE_LC_utf8((U8*)locinput)))
2646 {
2647 sayNO;
2648 }
2649 locinput += PL_utf8skip[nextchr];
2650 nextchr = UCHARAT(locinput);
2651 break;
d6a28714 2652 }
ffc61ed2
JH
2653 if (!(OP(scan) == SPACE
2654 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2655 sayNO;
2656 nextchr = UCHARAT(++locinput);
2657 }
2658 else {
2659 if (!(OP(scan) == SPACE
2660 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2661 sayNO;
2662 nextchr = UCHARAT(++locinput);
a0ed51b3 2663 }
a0ed51b3 2664 break;
d6a28714 2665 case NSPACEL:
3280af22 2666 PL_reg_flags |= RF_tainted;
bbce6d69 2667 /* FALL THROUGH */
d6a28714 2668 case NSPACE:
9442cb0e 2669 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2670 sayNO;
1aa99e6b 2671 if (do_utf8) {
8269fa76 2672 LOAD_UTF8_CHARCLASS(space," ");
ffc61ed2 2673 if (OP(scan) == NSPACE
3568d838 2674 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 2675 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
2676 {
2677 sayNO;
2678 }
2679 locinput += PL_utf8skip[nextchr];
2680 nextchr = UCHARAT(locinput);
2681 break;
a0ed51b3 2682 }
ffc61ed2 2683 if (OP(scan) == NSPACE
d6a28714 2684 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 2685 sayNO;
76e3520e 2686 nextchr = UCHARAT(++locinput);
a0d0e21e 2687 break;
d6a28714 2688 case DIGITL:
a0ed51b3
LW
2689 PL_reg_flags |= RF_tainted;
2690 /* FALL THROUGH */
d6a28714 2691 case DIGIT:
9442cb0e 2692 if (!nextchr)
a0ed51b3 2693 sayNO;
1aa99e6b 2694 if (do_utf8) {
8269fa76 2695 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2696 if (!(OP(scan) == DIGIT
3568d838 2697 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 2698 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 2699 {
a0ed51b3 2700 sayNO;
dfe13c55 2701 }
6f06b55f 2702 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2703 nextchr = UCHARAT(locinput);
2704 break;
2705 }
ffc61ed2 2706 if (!(OP(scan) == DIGIT
9442cb0e 2707 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
2708 sayNO;
2709 nextchr = UCHARAT(++locinput);
2710 break;
d6a28714 2711 case NDIGITL:
b8c5462f
JH
2712 PL_reg_flags |= RF_tainted;
2713 /* FALL THROUGH */
d6a28714 2714 case NDIGIT:
9442cb0e 2715 if (!nextchr && locinput >= PL_regeol)
b8c5462f 2716 sayNO;
1aa99e6b 2717 if (do_utf8) {
8269fa76 2718 LOAD_UTF8_CHARCLASS(digit,"0");
ffc61ed2 2719 if (OP(scan) == NDIGIT
3568d838 2720 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
2721 : isDIGIT_LC_utf8((U8*)locinput))
2722 {
a0ed51b3 2723 sayNO;
9442cb0e 2724 }
6f06b55f 2725 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2726 nextchr = UCHARAT(locinput);
2727 break;
2728 }
ffc61ed2 2729 if (OP(scan) == NDIGIT
9442cb0e 2730 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
2731 sayNO;
2732 nextchr = UCHARAT(++locinput);
2733 break;
2734 case CLUMP:
b7c83a7e 2735 if (locinput >= PL_regeol)
a0ed51b3 2736 sayNO;
b7c83a7e
JH
2737 if (do_utf8) {
2738 LOAD_UTF8_CHARCLASS(mark,"~");
2739 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2740 sayNO;
2741 locinput += PL_utf8skip[nextchr];
2742 while (locinput < PL_regeol &&
2743 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2744 locinput += UTF8SKIP(locinput);
2745 if (locinput > PL_regeol)
2746 sayNO;
eb08e2da
JH
2747 }
2748 else
2749 locinput++;
a0ed51b3
LW
2750 nextchr = UCHARAT(locinput);
2751 break;
c8756f30 2752 case REFFL:
3280af22 2753 PL_reg_flags |= RF_tainted;
c8756f30 2754 /* FALL THROUGH */
c277df42 2755 case REF:
c8756f30 2756 case REFF:
c277df42 2757 n = ARG(scan); /* which paren pair */
cf93c79d 2758 ln = PL_regstartp[n];
2c2d71f5 2759 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
eb160463 2760 if ((I32)*PL_reglastparen < n || ln == -1)
af3f8c16 2761 sayNO; /* Do not match unless seen CLOSEn. */
cf93c79d 2762 if (ln == PL_regendp[n])
a0d0e21e 2763 break;
a0ed51b3 2764
cf93c79d 2765 s = PL_bostr + ln;
1aa99e6b 2766 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 2767 char *l = locinput;
cf93c79d 2768 char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
2769 /*
2770 * Note that we can't do the "other character" lookup trick as
2771 * in the 8-bit case (no pun intended) because in Unicode we
2772 * have to map both upper and title case to lower case.
2773 */
2774 if (OP(scan) == REFF) {
a2a2844f 2775 STRLEN ulen1, ulen2;
e7ae6809
JH
2776 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
2777 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a0ed51b3
LW
2778 while (s < e) {
2779 if (l >= PL_regeol)
2780 sayNO;
a2a2844f
JH
2781 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2782 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 2783 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 2784 sayNO;
a2a2844f
JH
2785 s += ulen1;
2786 l += ulen2;
a0ed51b3
LW
2787 }
2788 }
2789 locinput = l;
2790 nextchr = UCHARAT(locinput);
2791 break;
2792 }
2793
a0d0e21e 2794 /* Inline the first character, for speed. */
76e3520e 2795 if (UCHARAT(s) != nextchr &&
c8756f30
AK
2796 (OP(scan) == REF ||
2797 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 2798 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 2799 sayNO;
cf93c79d 2800 ln = PL_regendp[n] - ln;
3280af22 2801 if (locinput + ln > PL_regeol)
4633a7c4 2802 sayNO;
c8756f30
AK
2803 if (ln > 1 && (OP(scan) == REF
2804 ? memNE(s, locinput, ln)
2805 : (OP(scan) == REFF
2806 ? ibcmp(s, locinput, ln)
2807 : ibcmp_locale(s, locinput, ln))))
4633a7c4 2808 sayNO;
a0d0e21e 2809 locinput += ln;
76e3520e 2810 nextchr = UCHARAT(locinput);
a0d0e21e
LW
2811 break;
2812
2813 case NOTHING:
c277df42 2814 case TAIL:
a0d0e21e
LW
2815 break;
2816 case BACK:
2817 break;
c277df42
IZ
2818 case EVAL:
2819 {
2820 dSP;
533c011a 2821 OP_4tree *oop = PL_op;
3280af22 2822 COP *ocurcop = PL_curcop;
f3548bdc 2823 PAD *old_comppad;
c277df42 2824 SV *ret;
9041c2e3 2825
c277df42 2826 n = ARG(scan);
533c011a 2827 PL_op = (OP_4tree*)PL_regdata->data[n];
d7d93a81 2828 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
f3548bdc 2829 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
cf93c79d 2830 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
c277df42 2831
8e5e9ebe
RGS
2832 {
2833 SV **before = SP;
2834 CALLRUNOPS(aTHX); /* Scalar context. */
2835 SPAGAIN;
2836 if (SP == before)
075aa684 2837 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
2838 else {
2839 ret = POPs;
2840 PUTBACK;
2841 }
2842 }
2843
0f5d15d6 2844 PL_op = oop;
f3548bdc 2845 PAD_RESTORE_LOCAL(old_comppad);
0f5d15d6 2846 PL_curcop = ocurcop;
c277df42 2847 if (logical) {
0f5d15d6
IZ
2848 if (logical == 2) { /* Postponed subexpression. */
2849 regexp *re;
22c35a8c 2850 MAGIC *mg = Null(MAGIC*);
0f5d15d6 2851 re_cc_state state;
0f5d15d6 2852 CHECKPOINT cp, lastcp;
cb50f42d 2853 int toggleutf;
0f5d15d6
IZ
2854
2855 if(SvROK(ret) || SvRMAGICAL(ret)) {
2856 SV *sv = SvROK(ret) ? SvRV(ret) : ret;
2857
2858 if(SvMAGICAL(sv))
14befaf4 2859 mg = mg_find(sv, PERL_MAGIC_qr);
0f5d15d6
IZ
2860 }
2861 if (mg) {
2862 re = (regexp *)mg->mg_obj;
df0003d4 2863 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
2864 }
2865 else {
2866 STRLEN len;
2867 char *t = SvPV(ret, len);
2868 PMOP pm;
2869 char *oprecomp = PL_regprecomp;
2870 I32 osize = PL_regsize;
2871 I32 onpar = PL_regnpar;
2872
5fcd1c1b 2873 Zero(&pm, 1, PMOP);
cb50f42d 2874 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
cea2e8a9 2875 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
9041c2e3 2876 if (!(SvFLAGS(ret)
0f5d15d6 2877 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
14befaf4
DM
2878 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2879 PERL_MAGIC_qr,0,0);
0f5d15d6
IZ
2880 PL_regprecomp = oprecomp;
2881 PL_regsize = osize;
2882 PL_regnpar = onpar;
2883 }
2884 DEBUG_r(
9041c2e3 2885 PerlIO_printf(Perl_debug_log,
0f5d15d6
IZ
2886 "Entering embedded `%s%.60s%s%s'\n",
2887 PL_colors[0],
2888 re->precomp,
2889 PL_colors[1],
2890 (strlen(re->precomp) > 60 ? "..." : ""))
2891 );
2892 state.node = next;
2893 state.prev = PL_reg_call_cc;
2894 state.cc = PL_regcc;
2895 state.re = PL_reg_re;
2896
2ab05381 2897 PL_regcc = 0;
9041c2e3 2898
0f5d15d6 2899 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 2900 REGCP_SET(lastcp);
0f5d15d6
IZ
2901 cache_re(re);
2902 state.ss = PL_savestack_ix;
2903 *PL_reglastparen = 0;
a01268b5 2904 *PL_reglastcloseparen = 0;
0f5d15d6
IZ
2905 PL_reg_call_cc = &state;
2906 PL_reginput = locinput;
cb50f42d
YST
2907 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2908 ((re->reganch & ROPT_UTF8) != 0);
2909 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2910
2911 /* XXXX This is too dramatic a measure... */
2912 PL_reg_maxiter = 0;
2913
0f5d15d6 2914 if (regmatch(re->program + 1)) {
2c914db6
IZ
2915 /* Even though we succeeded, we need to restore
2916 global variables, since we may be wrapped inside
2917 SUSPEND, thus the match may be not finished yet. */
2918
2919 /* XXXX Do this only if SUSPENDed? */
2920 PL_reg_call_cc = state.prev;
2921 PL_regcc = state.cc;
2922 PL_reg_re = state.re;
2923 cache_re(PL_reg_re);
cb50f42d 2924 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c914db6
IZ
2925
2926 /* XXXX This is too dramatic a measure... */
2927 PL_reg_maxiter = 0;
2928
2929 /* These are needed even if not SUSPEND. */
0f5d15d6
IZ
2930 ReREFCNT_dec(re);
2931 regcpblow(cp);
2932 sayYES;
2933 }
0f5d15d6 2934 ReREFCNT_dec(re);
02db2b7b 2935 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
2936 regcppop();
2937 PL_reg_call_cc = state.prev;
2938 PL_regcc = state.cc;
2939 PL_reg_re = state.re;
d3790889 2940 cache_re(PL_reg_re);
cb50f42d 2941 if (toggleutf) PL_reg_flags ^= RF_utf8;
2c2d71f5
JH
2942
2943 /* XXXX This is too dramatic a measure... */
2944 PL_reg_maxiter = 0;
2945
8e514ae6 2946 logical = 0;
0f5d15d6
IZ
2947 sayNO;
2948 }
c277df42 2949 sw = SvTRUE(ret);
0f5d15d6 2950 logical = 0;
a0ed51b3
LW
2951 }
2952 else
3280af22 2953 sv_setsv(save_scalar(PL_replgv), ret);
c277df42
IZ
2954 break;
2955 }
a0d0e21e 2956 case OPEN:
c277df42 2957 n = ARG(scan); /* which paren pair */
3280af22
NIS
2958 PL_reg_start_tmp[n] = locinput;
2959 if (n > PL_regsize)
2960 PL_regsize = n;
a0d0e21e
LW
2961 break;
2962 case CLOSE:
c277df42 2963 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
2964 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2965 PL_regendp[n] = locinput - PL_bostr;
eb160463 2966 if (n > (I32)*PL_reglastparen)
3280af22 2967 *PL_reglastparen = n;
a01268b5 2968 *PL_reglastcloseparen = n;
a0d0e21e 2969 break;
c277df42
IZ
2970 case GROUPP:
2971 n = ARG(scan); /* which paren pair */
eb160463 2972 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
2973 break;
2974 case IFTHEN:
2c2d71f5 2975 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
c277df42
IZ
2976 if (sw)
2977 next = NEXTOPER(NEXTOPER(scan));
2978 else {
2979 next = scan + ARG(scan);
2980 if (OP(next) == IFTHEN) /* Fake one. */
2981 next = NEXTOPER(NEXTOPER(next));
2982 }
2983 break;
2984 case LOGICAL:
0f5d15d6 2985 logical = scan->flags;
c277df42 2986 break;
2ab05381
IZ
2987/*******************************************************************
2988 PL_regcc contains infoblock about the innermost (...)* loop, and
2989 a pointer to the next outer infoblock.
2990
2991 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2992
2993 1) After matching X, regnode for CURLYX is processed;
2994
9041c2e3 2995 2) This regnode creates infoblock on the stack, and calls
2ab05381
IZ
2996 regmatch() recursively with the starting point at WHILEM node;
2997
2998 3) Each hit of WHILEM node tries to match A and Z (in the order
2999 depending on the current iteration, min/max of {min,max} and
3000 greediness). The information about where are nodes for "A"
3001 and "Z" is read from the infoblock, as is info on how many times "A"
3002 was already matched, and greediness.
3003
3004 4) After A matches, the same WHILEM node is hit again.
3005
3006 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3007 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3008 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3009 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3010 of the external loop.
3011
3012 Currently present infoblocks form a tree with a stem formed by PL_curcc
3013 and whatever it mentions via ->next, and additional attached trees
3014 corresponding to temporarily unset infoblocks as in "5" above.
3015
9041c2e3 3016 In the following picture infoblocks for outer loop of
2ab05381
IZ
3017 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3018 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3019 infoblocks are drawn below the "reset" infoblock.
3020
3021 In fact in the picture below we do not show failed matches for Z and T
3022 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3023 more obvious *why* one needs to *temporary* unset infoblocks.]
3024
3025 Matched REx position InfoBlocks Comment
3026 (Y(A)*?Z)*?T x
3027 Y(A)*?Z)*?T x <- O
3028 Y (A)*?Z)*?T x <- O
3029 Y A)*?Z)*?T x <- O <- I
3030 YA )*?Z)*?T x <- O <- I
3031 YA A)*?Z)*?T x <- O <- I
3032 YAA )*?Z)*?T x <- O <- I
3033 YAA Z)*?T x <- O # Temporary unset I
3034 I
3035
3036 YAAZ Y(A)*?Z)*?T x <- O
3037 I
3038
3039 YAAZY (A)*?Z)*?T x <- O
3040 I
3041
3042 YAAZY A)*?Z)*?T x <- O <- I
3043 I
3044
3045 YAAZYA )*?Z)*?T x <- O <- I
3046 I
3047
3048 YAAZYA Z)*?T x <- O # Temporary unset I
3049 I,I
3050
3051 YAAZYAZ )*?T x <- O
3052 I,I
3053
3054 YAAZYAZ T x # Temporary unset O
3055 O
3056 I,I
3057
3058 YAAZYAZT x
3059 O
3060 I,I
3061 *******************************************************************/
a0d0e21e
LW
3062 case CURLYX: {
3063 CURCUR cc;
3280af22 3064 CHECKPOINT cp = PL_savestack_ix;
cb434fcc
IZ
3065 /* No need to save/restore up to this paren */
3066 I32 parenfloor = scan->flags;
c277df42
IZ
3067
3068 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3069 next += ARG(next);
3280af22
NIS
3070 cc.oldcc = PL_regcc;
3071 PL_regcc = &cc;
cb434fcc
IZ
3072 /* XXXX Probably it is better to teach regpush to support
3073 parenfloor > PL_regsize... */
eb160463 3074 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc
IZ
3075 parenfloor = *PL_reglastparen; /* Pessimization... */
3076 cc.parenfloor = parenfloor;
a0d0e21e
LW
3077 cc.cur = -1;
3078 cc.min = ARG1(scan);
3079 cc.max = ARG2(scan);
c277df42 3080 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
a0d0e21e
LW
3081 cc.next = next;
3082 cc.minmod = minmod;
3083 cc.lastloc = 0;
3280af22 3084 PL_reginput = locinput;
a0d0e21e
LW
3085 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3086 regcpblow(cp);
3280af22 3087 PL_regcc = cc.oldcc;
4633a7c4 3088 saySAME(n);
a0d0e21e
LW
3089 }
3090 /* NOT REACHED */
3091 case WHILEM: {
3092 /*
3093 * This is really hard to understand, because after we match
3094 * what we're trying to match, we must make sure the rest of
2c2d71f5 3095 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3096 * to go back UP the parse tree by recursing ever deeper. And
3097 * if it fails, we have to reset our parent's current state
3098 * that we can try again after backing off.
3099 */
3100
c277df42 3101 CHECKPOINT cp, lastcp;
3280af22 3102 CURCUR* cc = PL_regcc;
c277df42
IZ
3103 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3104
4633a7c4 3105 n = cc->cur + 1; /* how many we know we matched */
3280af22 3106 PL_reginput = locinput;
a0d0e21e 3107
c277df42 3108 DEBUG_r(
9041c2e3 3109 PerlIO_printf(Perl_debug_log,
91f3b821 3110 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3280af22 3111 REPORT_CODE_OFF+PL_regindent*2, "",
9041c2e3 3112 (long)n, (long)cc->min,
2797576d 3113 (long)cc->max, PTR2UV(cc))
c277df42 3114 );
4633a7c4 3115
a0d0e21e
LW
3116 /* If degenerate scan matches "", assume scan done. */
3117
579cf2c3 3118 if (locinput == cc->lastloc && n >= cc->min) {
3280af22 3119 PL_regcc = cc->oldcc;
2ab05381
IZ
3120 if (PL_regcc)
3121 ln = PL_regcc->cur;
c277df42 3122 DEBUG_r(
c3464db5
DD
3123 PerlIO_printf(Perl_debug_log,
3124 "%*s empty match detected, try continuation...\n",
3280af22 3125 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3126 );
a0d0e21e 3127 if (regmatch(cc->next))
4633a7c4 3128 sayYES;
2ab05381
IZ
3129 if (PL_regcc)
3130 PL_regcc->cur = ln;
3280af22 3131 PL_regcc = cc;
4633a7c4 3132 sayNO;
a0d0e21e
LW
3133 }
3134
3135 /* First just match a string of min scans. */
3136
3137 if (n < cc->min) {
3138 cc->cur = n;
3139 cc->lastloc = locinput;
4633a7c4
LW
3140 if (regmatch(cc->scan))
3141 sayYES;
3142 cc->cur = n - 1;
c277df42 3143 cc->lastloc = lastloc;
4633a7c4 3144 sayNO;
a0d0e21e
LW
3145 }
3146
2c2d71f5
JH
3147 if (scan->flags) {
3148 /* Check whether we already were at this position.
3149 Postpone detection until we know the match is not
3150 *that* much linear. */
3151 if (!PL_reg_maxiter) {
3152 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3153 PL_reg_leftiter = PL_reg_maxiter;
3154 }
3155 if (PL_reg_leftiter-- == 0) {
3156 I32 size = (PL_reg_maxiter + 7)/8;
3157 if (PL_reg_poscache) {
eb160463 3158 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
3159 Renew(PL_reg_poscache, size, char);
3160 PL_reg_poscache_size = size;
3161 }
3162 Zero(PL_reg_poscache, size, char);
3163 }
3164 else {
3165 PL_reg_poscache_size = size;
3166 Newz(29, PL_reg_poscache, size, char);
3167 }
3168 DEBUG_r(
3169 PerlIO_printf(Perl_debug_log,
3170 "%sDetected a super-linear match, switching on caching%s...\n",
3171 PL_colors[4], PL_colors[5])
3172 );
3173 }
3174 if (PL_reg_leftiter < 0) {
3175 I32 o = locinput - PL_bostr, b;
3176
3177 o = (scan->flags & 0xf) - 1 + o * (scan->flags>>4);
3178 b = o % 8;
3179 o /= 8;
3180 if (PL_reg_poscache[o] & (1<<b)) {
3181 DEBUG_r(
3182 PerlIO_printf(Perl_debug_log,
3183 "%*s already tried at this position...\n",
3184 REPORT_CODE_OFF+PL_regindent*2, "")
3185 );
7821416a 3186 sayNO_SILENT;
2c2d71f5
JH
3187 }
3188 PL_reg_poscache[o] |= (1<<b);
3189 }
3190 }
3191
a0d0e21e
LW
3192 /* Prefer next over scan for minimal matching. */
3193
3194 if (cc->minmod) {
3280af22 3195 PL_regcc = cc->oldcc;
2ab05381
IZ
3196 if (PL_regcc)
3197 ln = PL_regcc->cur;
5f05dabc 3198 cp = regcppush(cc->parenfloor);
02db2b7b 3199 REGCP_SET(lastcp);
5f05dabc 3200 if (regmatch(cc->next)) {
c277df42 3201 regcpblow(cp);
4633a7c4 3202 sayYES; /* All done. */
5f05dabc 3203 }
02db2b7b 3204 REGCP_UNWIND(lastcp);
5f05dabc 3205 regcppop();
2ab05381
IZ
3206 if (PL_regcc)
3207 PL_regcc->cur = ln;
3280af22 3208 PL_regcc = cc;
a0d0e21e 3209
c277df42 3210 if (n >= cc->max) { /* Maximum greed exceeded? */
9041c2e3 3211 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3280af22
NIS
3212 && !(PL_reg_flags & RF_warned)) {
3213 PL_reg_flags |= RF_warned;
9014280d 3214 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
2f3ca594
GS
3215 "Complex regular subexpression recursion",
3216 REG_INFTY - 1);
c277df42 3217 }
4633a7c4 3218 sayNO;
c277df42 3219 }
a687059c 3220
c277df42 3221 DEBUG_r(
c3464db5
DD
3222 PerlIO_printf(Perl_debug_log,
3223 "%*s trying longer...\n",
3280af22 3224 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42 3225 );
a0d0e21e 3226 /* Try scanning more and see if it helps. */
3280af22 3227 PL_reginput = locinput;
a0d0e21e
LW
3228 cc->cur = n;
3229 cc->lastloc = locinput;
5f05dabc 3230 cp = regcppush(cc->parenfloor);
02db2b7b 3231 REGCP_SET(lastcp);
5f05dabc 3232 if (regmatch(cc->scan)) {
c277df42 3233 regcpblow(cp);
4633a7c4 3234 sayYES;
5f05dabc 3235 }
02db2b7b 3236 REGCP_UNWIND(lastcp);
5f05dabc 3237 regcppop();
4633a7c4 3238 cc->cur = n - 1;
c277df42 3239 cc->lastloc = lastloc;
4633a7c4 3240 sayNO;
a0d0e21e
LW
3241 }
3242
3243 /* Prefer scan over next for maximal matching. */
3244
3245 if (n < cc->max) { /* More greed allowed? */
5f05dabc 3246 cp = regcppush(cc->parenfloor);
a0d0e21e
LW
3247 cc->cur = n;
3248 cc->lastloc = locinput;
02db2b7b 3249 REGCP_SET(lastcp);
5f05dabc 3250 if (regmatch(cc->scan)) {
c277df42 3251 regcpblow(cp);
4633a7c4 3252 sayYES;
5f05dabc 3253 }
02db2b7b 3254 REGCP_UNWIND(lastcp);
a0d0e21e 3255 regcppop(); /* Restore some previous $<digit>s? */
3280af22 3256 PL_reginput = locinput;
c277df42 3257 DEBUG_r(
c3464db5
DD
3258 PerlIO_printf(Perl_debug_log,
3259 "%*s failed, try continuation...\n",
3280af22 3260 REPORT_CODE_OFF+PL_regindent*2, "")
c277df42
IZ
3261 );
3262 }
9041c2e3 3263 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
599cee73 3264 && !(PL_reg_flags & RF_warned)) {
3280af22 3265 PL_reg_flags |= RF_warned;
9014280d 3266 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
cb5d145d
GS
3267 "Complex regular subexpression recursion",
3268 REG_INFTY - 1);
a0d0e21e
LW
3269 }
3270
3271 /* Failed deeper matches of scan, so see if this one works. */
3280af22 3272 PL_regcc = cc->oldcc;
2ab05381
IZ
3273 if (PL_regcc)
3274 ln = PL_regcc->cur;
a0d0e21e 3275 if (regmatch(cc->next))
4633a7c4 3276 sayYES;
2ab05381
IZ
3277 if (PL_regcc)
3278 PL_regcc->cur = ln;
3280af22 3279 PL_regcc = cc;
4633a7c4 3280 cc->cur = n - 1;
c277df42 3281 cc->lastloc = lastloc;
4633a7c4 3282 sayNO;
a0d0e21e
LW
3283 }
3284 /* NOT REACHED */
9041c2e3 3285 case BRANCHJ:
c277df42
IZ
3286 next = scan + ARG(scan);
3287 if (next == scan)
3288 next = NULL;
3289 inner = NEXTOPER(NEXTOPER(scan));
3290 goto do_branch;
9041c2e3 3291 case BRANCH:
c277df42
IZ
3292 inner = NEXTOPER(scan);
3293 do_branch:
3294 {
c277df42
IZ
3295 c1 = OP(scan);
3296 if (OP(next) != c1) /* No choice. */
3297 next = inner; /* Avoid recursion. */
a0d0e21e 3298 else {
02db2b7b
IZ
3299 I32 lastparen = *PL_reglastparen;
3300 I32 unwind1;
3301 re_unwind_branch_t *uw;
3302
3303 /* Put unwinding data on stack */
3304 unwind1 = SSNEWt(1,re_unwind_branch_t);
3305 uw = SSPTRt(unwind1,re_unwind_branch_t);
3306 uw->prev = unwind;
3307 unwind = unwind1;
3308 uw->type = ((c1 == BRANCH)
3309 ? RE_UNWIND_BRANCH
3310 : RE_UNWIND_BRANCHJ);
3311 uw->lastparen = lastparen;
3312 uw->next = next;
3313 uw->locinput = locinput;
3314 uw->nextchr = nextchr;
3315#ifdef DEBUGGING
3316 uw->regindent = ++PL_regindent;
3317#endif
c277df42 3318
02db2b7b
IZ
3319 REGCP_SET(uw->lastcp);
3320
3321 /* Now go into the first branch */
3322 next = inner;
a687059c 3323 }
a0d0e21e
LW
3324 }
3325 break;
3326 case MINMOD:
3327 minmod = 1;
3328 break;
c277df42
IZ
3329 case CURLYM:
3330 {
00db4c45 3331 I32 l = 0;
c277df42 3332 CHECKPOINT lastcp;
9041c2e3 3333
c277df42
IZ
3334 /* We suppose that the next guy does not need
3335 backtracking: in particular, it is of constant length,
3336 and has no parenths to influence future backrefs. */
3337 ln = ARG1(scan); /* min to match */
3338 n = ARG2(scan); /* max to match */
c277df42
IZ
3339 paren = scan->flags;
3340 if (paren) {
3280af22
NIS
3341 if (paren > PL_regsize)
3342 PL_regsize = paren;
eb160463 3343 if (paren > (I32)*PL_reglastparen)
3280af22 3344 *PL_reglastparen = paren;
c277df42 3345 }
dc45a647 3346 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42
IZ
3347 if (paren)
3348 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3280af22 3349 PL_reginput = locinput;
c277df42
IZ
3350 if (minmod) {
3351 minmod = 0;
3352 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3353 sayNO;
f31a99c8
HS
3354 /* if we matched something zero-length we don't need to
3355 backtrack - capturing parens are already defined, so
3356 the caveat in the maximal case doesn't apply
3357
3358 XXXX if ln == 0, we can redo this check first time
3359 through the following loop
3360 */
3361 if (ln && l == 0)
3362 n = ln; /* don't backtrack */
3280af22 3363 locinput = PL_reginput;
cca55fe3 3364 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3365 regnode *text_node = next;
3366
cca55fe3 3367 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3368
cca55fe3 3369 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3370 else {
cca55fe3
JP
3371 if (PL_regkind[(U8)OP(text_node)] == REF) {
3372 I32 n, ln;
3373 n = ARG(text_node); /* which paren pair */
3374 ln = PL_regstartp[n];
3375 /* assume yes if we haven't seen CLOSEn */
3376 if (
eb160463 3377 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3378 ln == -1 ||
3379 ln == PL_regendp[n]
3380 ) {
3381 c1 = c2 = -1000;
3382 goto assume_ok_MM;
3383 }
3384 c1 = *(PL_bostr + ln);
3385 }
3386 else { c1 = (U8)*STRING(text_node); }
af5decee 3387 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3388 c2 = PL_fold[c1];
af5decee 3389 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3390 c2 = PL_fold_locale[c1];
3391 else
3392 c2 = c1;
3393 }
a0ed51b3
LW
3394 }
3395 else
c277df42 3396 c1 = c2 = -1000;
cca55fe3 3397 assume_ok_MM:
02db2b7b 3398 REGCP_SET(lastcp);
5f4b28b2 3399 /* This may be improved if l == 0. */
c277df42
IZ
3400 while (n >= ln || (n == REG_INFTY && ln > 0 && l)) { /* ln overflow ? */
3401 /* If it could work, try it. */
3402 if (c1 == -1000 ||
3280af22
NIS
3403 UCHARAT(PL_reginput) == c1 ||
3404 UCHARAT(PL_reginput) == c2)
c277df42
IZ
3405 {
3406 if (paren) {
f31a99c8 3407 if (ln) {
cf93c79d
IZ
3408 PL_regstartp[paren] =
3409 HOPc(PL_reginput, -l) - PL_bostr;
3410 PL_regendp[paren] = PL_reginput - PL_bostr;
a0ed51b3
LW
3411 }
3412 else
cf93c79d 3413 PL_regendp[paren] = -1;
c277df42
IZ
3414 }
3415 if (regmatch(next))
3416 sayYES;
02db2b7b 3417 REGCP_UNWIND(lastcp);
c277df42
IZ
3418 }
3419 /* Couldn't or didn't -- move forward. */
3280af22 3420 PL_reginput = locinput;
c277df42
IZ
3421 if (regrepeat_hard(scan, 1, &l)) {
3422 ln++;
3280af22 3423 locinput = PL_reginput;
c277df42
IZ
3424 }
3425 else
3426 sayNO;
3427 }
a0ed51b3
LW
3428 }
3429 else {
c277df42 3430 n = regrepeat_hard(scan, n, &l);
f31a99c8
HS
3431 /* if we matched something zero-length we don't need to
3432 backtrack, unless the minimum count is zero and we
3433 are capturing the result - in that case the capture
3434 being defined or not may affect later execution
3435 */
3436 if (n != 0 && l == 0 && !(paren && ln == 0))
3437 ln = n; /* don't backtrack */
3280af22 3438 locinput = PL_reginput;
c277df42 3439 DEBUG_r(
5c0ca799 3440 PerlIO_printf(Perl_debug_log,
faccc32b 3441 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
7b0972df 3442 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
faccc32b 3443 (IV) n, (IV)l)
c277df42
IZ
3444 );
3445 if (n >= ln) {
cca55fe3 3446 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3447 regnode *text_node = next;
3448
cca55fe3 3449 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3450
cca55fe3 3451 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3452 else {
cca55fe3
JP
3453 if (PL_regkind[(U8)OP(text_node)] == REF) {
3454 I32 n, ln;
3455 n = ARG(text_node); /* which paren pair */
3456 ln = PL_regstartp[n];
3457 /* assume yes if we haven't seen CLOSEn */
3458 if (
eb160463 3459 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3460 ln == -1 ||
3461 ln == PL_regendp[n]
3462 ) {
3463 c1 = c2 = -1000;
3464 goto assume_ok_REG;
3465 }
3466 c1 = *(PL_bostr + ln);
3467 }
3468 else { c1 = (U8)*STRING(text_node); }
3469
af5decee 3470 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3471 c2 = PL_fold[c1];
af5decee 3472 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf
JP
3473 c2 = PL_fold_locale[c1];
3474 else
3475 c2 = c1;
3476 }
a0ed51b3
LW
3477 }
3478 else
c277df42
IZ
3479 c1 = c2 = -1000;
3480 }
cca55fe3 3481 assume_ok_REG:
02db2b7b 3482 REGCP_SET(lastcp);
c277df42
IZ
3483 while (n >= ln) {
3484 /* If it could work, try it. */
3485 if (c1 == -1000 ||
3280af22
NIS
3486 UCHARAT(PL_reginput) == c1 ||
3487 UCHARAT(PL_reginput) == c2)
a0ed51b3
LW
3488 {
3489 DEBUG_r(
c3464db5 3490 PerlIO_printf(Perl_debug_log,
7b0972df
JH
3491 "%*s trying tail with n=%"IVdf"...\n",
3492 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
a0ed51b3
LW
3493 );
3494 if (paren) {
3495 if (n) {
cf93c79d
IZ
3496 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3497 PL_regendp[paren] = PL_reginput - PL_bostr;
c277df42 3498 }
a0ed51b3 3499 else
cf93c79d 3500 PL_regendp[paren] = -1;
c277df42 3501 }
a0ed51b3
LW
3502 if (regmatch(next))
3503 sayYES;
02db2b7b 3504 REGCP_UNWIND(lastcp);
a0ed51b3 3505 }
c277df42
IZ
3506 /* Couldn't or didn't -- back up. */
3507 n--;
dfe13c55 3508 locinput = HOPc(locinput, -l);
3280af22 3509 PL_reginput = locinput;
c277df42
IZ
3510 }
3511 }
3512 sayNO;
3513 break;
3514 }
3515 case CURLYN:
3516 paren = scan->flags; /* Which paren to set */
3280af22
NIS
3517 if (paren > PL_regsize)
3518 PL_regsize = paren;
eb160463 3519 if (paren > (I32)*PL_reglastparen)
3280af22 3520 *PL_reglastparen = paren;
c277df42
IZ
3521 ln = ARG1(scan); /* min to match */
3522 n = ARG2(scan); /* max to match */
dc45a647 3523 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
c277df42 3524 goto repeat;
a0d0e21e 3525 case CURLY:
c277df42 3526 paren = 0;
a0d0e21e
LW
3527 ln = ARG1(scan); /* min to match */
3528 n = ARG2(scan); /* max to match */
dc45a647 3529 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
a0d0e21e
LW
3530 goto repeat;
3531 case STAR:
3532 ln = 0;
c277df42 3533 n = REG_INFTY;
a0d0e21e 3534 scan = NEXTOPER(scan);
c277df42 3535 paren = 0;
a0d0e21e
LW
3536 goto repeat;
3537 case PLUS:
c277df42
IZ
3538 ln = 1;
3539 n = REG_INFTY;
3540 scan = NEXTOPER(scan);
3541 paren = 0;
3542 repeat:
a0d0e21e
LW
3543 /*
3544 * Lookahead to avoid useless match attempts
3545 * when we know what character comes next.
3546 */
5f80c4cf
JP
3547
3548 /*
3549 * Used to only do .*x and .*?x, but now it allows
3550 * for )'s, ('s and (?{ ... })'s to be in the way
3551 * of the quantifier and the EXACT-like node. -- japhy
3552 */
3553
cca55fe3 3554 if (HAS_TEXT(next) || JUMPABLE(next)) {
5f80c4cf
JP
3555 U8 *s;
3556 regnode *text_node = next;
3557
cca55fe3 3558 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
5f80c4cf 3559
cca55fe3 3560 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
5f80c4cf 3561 else {
cca55fe3
JP
3562 if (PL_regkind[(U8)OP(text_node)] == REF) {
3563 I32 n, ln;
3564 n = ARG(text_node); /* which paren pair */
3565 ln = PL_regstartp[n];
3566 /* assume yes if we haven't seen CLOSEn */
3567 if (
eb160463 3568 (I32)*PL_reglastparen < n ||
cca55fe3
JP
3569 ln == -1 ||
3570 ln == PL_regendp[n]
3571 ) {
3572 c1 = c2 = -1000;
3573 goto assume_ok_easy;
3574 }
9246c65e 3575 s = (U8*)PL_bostr + ln;
cca55fe3
JP
3576 }
3577 else { s = (U8*)STRING(text_node); }
5f80c4cf
JP
3578
3579 if (!UTF) {
3580 c2 = c1 = *s;
f65d3ee7 3581 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
5f80c4cf 3582 c2 = PL_fold[c1];
f65d3ee7 3583 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
5f80c4cf 3584 c2 = PL_fold_locale[c1];
1aa99e6b 3585 }
5f80c4cf 3586 else { /* UTF */
f65d3ee7 3587 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
a2a2844f 3588 STRLEN ulen1, ulen2;
e7ae6809
JH
3589 U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
3590 U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
a2a2844f
JH
3591
3592 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3593 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3594
872c91ae
JH
3595 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
3596 ckWARN(WARN_UTF8) ?
3597 0 : UTF8_ALLOW_ANY);
3598 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
3599 ckWARN(WARN_UTF8) ?
3600 0 : UTF8_ALLOW_ANY);
5f80c4cf
JP
3601 }
3602 else {
872c91ae
JH
3603 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
3604 ckWARN(WARN_UTF8) ?
3605 0 : UTF8_ALLOW_ANY);
5f80c4cf 3606 }
1aa99e6b
IH
3607 }
3608 }
bbce6d69 3609 }
a0d0e21e 3610 else
bbce6d69 3611 c1 = c2 = -1000;
cca55fe3 3612 assume_ok_easy:
3280af22 3613 PL_reginput = locinput;
a0d0e21e 3614 if (minmod) {
c277df42 3615 CHECKPOINT lastcp;
a0d0e21e
LW
3616 minmod = 0;
3617 if (ln && regrepeat(scan, ln) < ln)
4633a7c4 3618 sayNO;
a0ed51b3 3619 locinput = PL_reginput;
02db2b7b 3620 REGCP_SET(lastcp);
0fe9bf95 3621 if (c1 != -1000) {
1aa99e6b 3622 char *e; /* Should not check after this */
0fe9bf95 3623 char *old = locinput;
b2f2f093 3624 int count = 0;
0fe9bf95 3625
1aa99e6b 3626 if (n == REG_INFTY) {
0fe9bf95 3627 e = PL_regeol - 1;
1aa99e6b
IH
3628 if (do_utf8)
3629 while (UTF8_IS_CONTINUATION(*(U8*)e))
3630 e--;
3631 }
3632 else if (do_utf8) {
3633 int m = n - ln;
3634 for (e = locinput;
3635 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3636 e += UTF8SKIP(e);
3637 }
3638 else {
3639 e = locinput + n - ln;
3640 if (e >= PL_regeol)
3641 e = PL_regeol - 1;
3642 }
0fe9bf95
IZ
3643 while (1) {
3644 /* Find place 'next' could work */
1aa99e6b
IH
3645 if (!do_utf8) {
3646 if (c1 == c2) {
a8e8ab15
JH
3647 while (locinput <= e &&
3648 UCHARAT(locinput) != c1)
1aa99e6b
IH
3649 locinput++;
3650 } else {
9041c2e3 3651 while (locinput <= e
a8e8ab15
JH
3652 && UCHARAT(locinput) != c1
3653 && UCHARAT(locinput) != c2)
1aa99e6b
IH
3654 locinput++;
3655 }
3656 count = locinput - old;
3657 }
3658 else {
3659 STRLEN len;
3660 if (c1 == c2) {
872c91ae
JH
3661 /* count initialised to
3662 * utf8_distance(old, locinput) */
b2f2f093 3663 while (locinput <= e &&
872c91ae
JH
3664 utf8n_to_uvchr((U8*)locinput,
3665 UTF8_MAXLEN, &len,
3666 ckWARN(WARN_UTF8) ?
eb160463 3667 0 : UTF8_ALLOW_ANY) != (UV)c1) {
1aa99e6b 3668 locinput += len;
b2f2f093
JH
3669 count++;
3670 }
1aa99e6b 3671 } else {
872c91ae
JH
3672 /* count initialised to
3673 * utf8_distance(old, locinput) */
b2f2f093 3674 while (locinput <= e) {
872c91ae
JH
3675 UV c = utf8n_to_uvchr((U8*)locinput,
3676 UTF8_MAXLEN, &len,
3677 ckWARN(WARN_UTF8) ?
3678 0 : UTF8_ALLOW_ANY);
eb160463 3679 if (c == (UV)c1 || c == (UV)c2)
1aa99e6b 3680 break;
b2f2f093
JH
3681 locinput += len;
3682 count++;
1aa99e6b
IH
3683 }
3684 }
0fe9bf95 3685 }
9041c2e3 3686 if (locinput > e)
0fe9bf95
IZ
3687 sayNO;
3688 /* PL_reginput == old now */
3689 if (locinput != old) {
3690 ln = 1; /* Did some */
1aa99e6b 3691 if (regrepeat(scan, count) < count)
0fe9bf95
IZ
3692 sayNO;
3693 }
3694 /* PL_reginput == locinput now */
29d1e993 3695 TRYPAREN(paren, ln, locinput);
0fe9bf95 3696 PL_reginput = locinput; /* Could be reset... */
02db2b7b 3697 REGCP_UNWIND(lastcp);
0fe9bf95 3698 /* Couldn't or didn't -- move forward. */
1aa99e6b
IH
3699 old = locinput;
3700 if (do_utf8)
3701 locinput += UTF8SKIP(locinput);
3702 else
3703 locinput++;
b2f2f093 3704 count = 1;
0fe9bf95
IZ
3705 }
3706 }
3707 else
c277df42 3708 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
1aa99e6b
IH
3709 UV c;
3710 if (c1 != -1000) {
3711 if (do_utf8)
872c91ae
JH
3712 c = utf8n_to_uvchr((U8*)PL_reginput,
3713 UTF8_MAXLEN, 0,
3714 ckWARN(WARN_UTF8) ?
3715 0 : UTF8_ALLOW_ANY);
1aa99e6b 3716 else
9041c2e3 3717 c = UCHARAT(PL_reginput);
2390ecbc 3718 /* If it could work, try it. */
eb160463 3719 if (c == (UV)c1 || c == (UV)c2)
2390ecbc 3720 {
ecc99935 3721 TRYPAREN(paren, ln, PL_reginput);
2390ecbc
PP
3722 REGCP_UNWIND(lastcp);
3723 }
1aa99e6b 3724 }
a0d0e21e 3725 /* If it could work, try it. */
2390ecbc 3726 else if (c1 == -1000)
bbce6d69 3727 {
ecc99935 3728 TRYPAREN(paren, ln, PL_reginput);
02db2b7b 3729 REGCP_UNWIND(lastcp);
bbce6d69 3730 }
c277df42 3731 /* Couldn't or didn't -- move forward. */
a0ed51b3 3732 PL_reginput = locinput;
a0d0e21e
LW
3733 if (regrepeat(scan, 1)) {
3734 ln++;
a0ed51b3
LW
3735 locinput = PL_reginput;
3736 }
3737 else
4633a7c4 3738 sayNO;
a0d0e21e
LW
3739 }
3740 }
3741 else {
c277df42 3742 CHECKPOINT lastcp;
a0d0e21e 3743 n = regrepeat(scan, n);
a0ed51b3 3744 locinput = PL_reginput;
22c35a8c 3745 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
15272685
HS
3746 ((!PL_multiline && OP(next) != MEOL) ||
3747 OP(next) == SEOL || OP(next) == EOS))
3748 {
a0d0e21e 3749 ln = n; /* why back off? */
1aeab75a
GS
3750 /* ...because $ and \Z can match before *and* after
3751 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3752 We should back off by one in this case. */
3753 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3754 ln--;
3755 }
02db2b7b 3756 REGCP_SET(lastcp);
c277df42 3757 if (paren) {
8fa7f367 3758 UV c = 0;
c277df42 3759 while (n >= ln) {
1aa99e6b
IH
3760 if (c1 != -1000) {
3761 if (do_utf8)
872c91ae
JH
3762 c = utf8n_to_uvchr((U8*)PL_reginput,
3763 UTF8_MAXLEN, 0,
3764 ckWARN(WARN_UTF8) ?
3765 0 : UTF8_ALLOW_ANY);
1aa99e6b 3766 else
9041c2e3 3767 c = UCHARAT(PL_reginput);
1aa99e6b 3768 }
c277df42 3769 /* If it could work, try it. */
eb160463 3770 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3771 {
29d1e993 3772 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3773 REGCP_UNWIND(lastcp);
c277df42
IZ
3774 }
3775 /* Couldn't or didn't -- back up. */
3776 n--;
dfe13c55 3777 PL_reginput = locinput = HOPc(locinput, -1);
c277df42 3778 }
a0ed51b3
LW
3779 }
3780 else {
8fa7f367 3781 UV c = 0;
c277df42 3782 while (n >= ln) {
1aa99e6b
IH
3783 if (c1 != -1000) {
3784 if (do_utf8)
872c91ae
JH
3785 c = utf8n_to_uvchr((U8*)PL_reginput,
3786 UTF8_MAXLEN, 0,
3787 ckWARN(WARN_UTF8) ?
3788 0 : UTF8_ALLOW_ANY);
1aa99e6b 3789 else
9041c2e3 3790 c = UCHARAT(PL_reginput);
1aa99e6b 3791 }
c277df42 3792 /* If it could work, try it. */
eb160463 3793 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
c277df42 3794 {
29d1e993 3795 TRYPAREN(paren, n, PL_reginput);
02db2b7b 3796 REGCP_UNWIND(lastcp);
c277df42
IZ
3797 }
3798 /* Couldn't or didn't -- back up. */
3799 n--;
dfe13c55 3800 PL_reginput = locinput = HOPc(locinput, -1);
bbce6d69 3801 }
a0d0e21e
LW
3802 }
3803 }
4633a7c4 3804 sayNO;
c277df42 3805 break;
a0d0e21e 3806 case END:
0f5d15d6
IZ
3807 if (PL_reg_call_cc) {
3808 re_cc_state *cur_call_cc = PL_reg_call_cc;
3809 CURCUR *cctmp = PL_regcc;
3810 regexp *re = PL_reg_re;
3811 CHECKPOINT cp, lastcp;
3812
3813 cp = regcppush(0); /* Save *all* the positions. */
02db2b7b 3814 REGCP_SET(lastcp);
0f5d15d6
IZ
3815 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3816 the caller. */
3817 PL_reginput = locinput; /* Make position available to
3818 the callcc. */
3819 cache_re(PL_reg_call_cc->re);
3820 PL_regcc = PL_reg_call_cc->cc;
3821 PL_reg_call_cc = PL_reg_call_cc->prev;
3822 if (regmatch(cur_call_cc->node)) {
3823 PL_reg_call_cc = cur_call_cc;
3824 regcpblow(cp);
3825 sayYES;
3826 }
02db2b7b 3827 REGCP_UNWIND(lastcp);
0f5d15d6
IZ
3828 regcppop();
3829 PL_reg_call_cc = cur_call_cc;
3830 PL_regcc = cctmp;
3831 PL_reg_re = re;
3832 cache_re(re);
3833
3834 DEBUG_r(
3835 PerlIO_printf(Perl_debug_log,
3836 "%*s continuation failed...\n",
3837 REPORT_CODE_OFF+PL_regindent*2, "")
3838 );
7821416a 3839 sayNO_SILENT;
0f5d15d6 3840 }
7821416a
IZ
3841 if (locinput < PL_regtill) {
3842 DEBUG_r(PerlIO_printf(Perl_debug_log,
3843 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3844 PL_colors[4],
3845 (long)(locinput - PL_reg_starttry),
3846 (long)(PL_regtill - PL_reg_starttry),
3847 PL_colors[5]));
3848 sayNO_FINAL; /* Cannot match: too short. */
3849 }
3850 PL_reginput = locinput; /* put where regtry can find it */
3851 sayYES_FINAL; /* Success! */
7e5428c5 3852 case SUCCEED:
3280af22 3853 PL_reginput = locinput; /* put where regtry can find it */
7821416a 3854 sayYES_LOUD; /* Success! */
c277df42
IZ
3855 case SUSPEND:
3856 n = 1;
9fe1d20c 3857 PL_reginput = locinput;
9041c2e3 3858 goto do_ifmatch;
a0d0e21e 3859 case UNLESSM:
c277df42 3860 n = 0;
a0ed51b3 3861 if (scan->flags) {
efb30f32
HS
3862 s = HOPBACKc(locinput, scan->flags);
3863 if (!s)
3864 goto say_yes;
3865 PL_reginput = s;
a0ed51b3
LW
3866 }
3867 else
3868 PL_reginput = locinput;
c277df42
IZ
3869 goto do_ifmatch;
3870 case IFMATCH:
3871 n = 1;
a0ed51b3 3872 if (scan->flags) {
efb30f32
HS
3873 s = HOPBACKc(locinput, scan->flags);
3874 if (!s)
3875 goto say_no;
3876 PL_reginput = s;
a0ed51b3
LW
3877 }
3878 else
3879 PL_reginput = locinput;
3880
c277df42 3881 do_ifmatch:
c277df42
IZ
3882 inner = NEXTOPER(NEXTOPER(scan));
3883 if (regmatch(inner) != n) {
3884 say_no:
3885 if (logical) {
3886 logical = 0;
3887 sw = 0;
3888 goto do_longjump;
a0ed51b3
LW
3889 }
3890 else
c277df42
IZ
3891 sayNO;
3892 }
3893 say_yes:
3894 if (logical) {
3895 logical = 0;
3896 sw = 1;
3897 }
fe44a5e8 3898 if (OP(scan) == SUSPEND) {
3280af22 3899 locinput = PL_reginput;
565764a8 3900 nextchr = UCHARAT(locinput);
fe44a5e8 3901 }
c277df42
IZ
3902 /* FALL THROUGH. */
3903 case LONGJMP:
3904 do_longjump:
3905 next = scan + ARG(scan);
3906 if (next == scan)
3907 next = NULL;
a0d0e21e
LW
3908 break;
3909 default:
b900a521 3910 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 3911 PTR2UV(scan), OP(scan));
cea2e8a9 3912 Perl_croak(aTHX_ "regexp memory corruption");
a687059c 3913 }
02db2b7b 3914 reenter:
a0d0e21e
LW
3915 scan = next;
3916 }
a687059c 3917
a0d0e21e
LW
3918 /*
3919 * We get here only if there's trouble -- normally "case END" is
3920 * the terminating point.
3921 */
cea2e8a9 3922 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 3923 /*NOTREACHED*/
4633a7c4
LW
3924 sayNO;
3925
7821416a
IZ
3926yes_loud:
3927 DEBUG_r(
3928 PerlIO_printf(Perl_debug_log,
3929 "%*s %scould match...%s\n",
3930 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3931 );
3932 goto yes;
3933yes_final:
3934 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3935 PL_colors[4],PL_colors[5]));
4633a7c4
LW
3936yes:
3937#ifdef DEBUGGING
3280af22 3938 PL_regindent--;
4633a7c4 3939#endif
02db2b7b
IZ
3940
3941#if 0 /* Breaks $^R */
3942 if (unwind)
3943 regcpblow(firstcp);
3944#endif
4633a7c4
LW
3945 return 1;
3946
3947no:
7821416a
IZ
3948 DEBUG_r(
3949 PerlIO_printf(Perl_debug_log,
3950 "%*s %sfailed...%s\n",
3951 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3952 );
3953 goto do_no;
3954no_final:
3955do_no:
02db2b7b
IZ
3956 if (unwind) {
3957 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3958
3959 switch (uw->type) {
3960 case RE_UNWIND_BRANCH:
3961 case RE_UNWIND_BRANCHJ:
3962 {
3963 re_unwind_branch_t *uwb = &(uw->branch);
3964 I32 lastparen = uwb->lastparen;
9041c2e3 3965
02db2b7b
IZ
3966 REGCP_UNWIND(uwb->lastcp);
3967 for (n = *PL_reglastparen; n > lastparen; n--)
3968 PL_regendp[n] = -1;
3969 *PL_reglastparen = n;
3970 scan = next = uwb->next;
9041c2e3
NIS
3971 if ( !scan ||
3972 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
02db2b7b
IZ
3973 ? BRANCH : BRANCHJ) ) { /* Failure */
3974 unwind = uwb->prev;
3975#ifdef DEBUGGING
3976 PL_regindent--;
3977#endif
3978 goto do_no;
3979 }
3980 /* Have more choice yet. Reuse the same uwb. */
3981 /*SUPPRESS 560*/
3982 if ((n = (uwb->type == RE_UNWIND_BRANCH
3983 ? NEXT_OFF(next) : ARG(next))))
3984 next += n;
3985 else
3986 next = NULL; /* XXXX Needn't unwinding in this case... */
3987 uwb->next = next;
3988 next = NEXTOPER(scan);
3989 if (uwb->type == RE_UNWIND_BRANCHJ)
3990 next = NEXTOPER(next);
3991 locinput = uwb->locinput;
3992 nextchr = uwb->nextchr;
3993#ifdef DEBUGGING
3994 PL_regindent = uwb->regindent;
3995#endif
3996
3997 goto reenter;
3998 }
3999 /* NOT REACHED */
4000 default:
4001 Perl_croak(aTHX_ "regexp unwind memory corruption");
4002 }
4003 /* NOT REACHED */
4004 }
4633a7c4 4005#ifdef DEBUGGING
3280af22 4006 PL_regindent--;
4633a7c4 4007#endif
a0d0e21e 4008 return 0;
a687059c
LW
4009}
4010
4011/*
4012 - regrepeat - repeatedly match something simple, report how many
4013 */
4014/*
4015 * [This routine now assumes that it will only match on things of length 1.
4016 * That was true before, but now we assume scan - reginput is the count,
a0ed51b3 4017 * rather than incrementing count on every character. [Er, except utf8.]]
a687059c 4018 */
76e3520e 4019STATIC I32
cea2e8a9 4020S_regrepeat(pTHX_ regnode *p, I32 max)
a687059c 4021{
a0d0e21e 4022 register char *scan;
a0d0e21e 4023 register I32 c;
3280af22 4024 register char *loceol = PL_regeol;
a0ed51b3 4025 register I32 hardcount = 0;
53c4c00c 4026 register bool do_utf8 = PL_reg_match_utf8;
a0d0e21e 4027
3280af22 4028 scan = PL_reginput;
faf11cac
HS
4029 if (max == REG_INFTY)
4030 max = I32_MAX;
4031 else if (max < loceol - scan)
a0d0e21e 4032 loceol = scan + max;
a0d0e21e 4033 switch (OP(p)) {
22c35a8c 4034 case REG_ANY:
1aa99e6b 4035 if (do_utf8) {
ffc61ed2 4036 loceol = PL_regeol;
1aa99e6b 4037 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
4038 scan += UTF8SKIP(scan);
4039 hardcount++;
4040 }
4041 } else {
4042 while (scan < loceol && *scan != '\n')
4043 scan++;
a0ed51b3
LW
4044 }
4045 break;
ffc61ed2 4046 case SANY:
def8e4ea
JH
4047 if (do_utf8) {
4048 loceol = PL_regeol;
a0804c9e 4049 while (scan < loceol && hardcount < max) {
def8e4ea
JH
4050 scan += UTF8SKIP(scan);
4051 hardcount++;
4052 }
4053 }
4054 else
4055 scan = loceol;
a0ed51b3 4056 break;
f33976b4
DB
4057 case CANY:
4058 scan = loceol;
4059 break;
bbce6d69 4060 case EXACT: /* length of string is 1 */
cd439c50 4061 c = (U8)*STRING(p);
bbce6d69 4062 while (scan < loceol && UCHARAT(scan) == c)
4063 scan++;
4064 break;
4065 case EXACTF: /* length of string is 1 */
cd439c50 4066 c = (U8)*STRING(p);
bbce6d69 4067 while (scan < loceol &&
22c35a8c 4068 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
bbce6d69 4069 scan++;
4070 break;
4071 case EXACTFL: /* length of string is 1 */
3280af22 4072 PL_reg_flags |= RF_tainted;
cd439c50 4073 c = (U8)*STRING(p);
bbce6d69 4074 while (scan < loceol &&
22c35a8c 4075 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
a0d0e21e
LW
4076 scan++;
4077 break;
4078 case ANYOF:
ffc61ed2
JH
4079 if (do_utf8) {
4080 loceol = PL_regeol;
388cc4de 4081 while (hardcount < max && scan < loceol) {
5dec093f
AE
4082 if (!REGINCLASS_utf8(p, (U8*)scan))
4083 break;
ffc61ed2
JH
4084 scan += UTF8SKIP(scan);
4085 hardcount++;
4086 }
4087 } else {
7d3e948e 4088 while (scan < loceol && REGINCLASS(p, (U8*)scan))
ffc61ed2
JH
4089 scan++;
4090 }
a0d0e21e
LW
4091 break;
4092 case ALNUM:
1aa99e6b 4093 if (do_utf8) {
ffc61ed2 4094 loceol = PL_regeol;
8269fa76 4095 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4096 while (hardcount < max && scan < loceol &&
3568d838 4097 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4098 scan += UTF8SKIP(scan);
4099 hardcount++;
4100 }
4101 } else {
4102 while (scan < loceol && isALNUM(*scan))
4103 scan++;
a0ed51b3
LW
4104 }
4105 break;
bbce6d69 4106 case ALNUML:
3280af22 4107 PL_reg_flags |= RF_tainted;
1aa99e6b 4108 if (do_utf8) {
ffc61ed2 4109 loceol = PL_regeol;
1aa99e6b
IH
4110 while (hardcount < max && scan < loceol &&
4111 isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4112 scan += UTF8SKIP(scan);
4113 hardcount++;
4114 }
4115 } else {
4116 while (scan < loceol && isALNUM_LC(*scan))
4117 scan++;
a0ed51b3
LW
4118 }
4119 break;
a0d0e21e 4120 case NALNUM:
1aa99e6b 4121 if (do_utf8) {
ffc61ed2 4122 loceol = PL_regeol;
8269fa76 4123 LOAD_UTF8_CHARCLASS(alnum,"a");
1aa99e6b 4124 while (hardcount < max && scan < loceol &&
3568d838 4125 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4126 scan += UTF8SKIP(scan);
4127 hardcount++;
4128 }
4129 } else {
4130 while (scan < loceol && !isALNUM(*scan))
4131 scan++;
a0ed51b3
LW
4132 }
4133 break;
bbce6d69 4134 case NALNUML:
3280af22 4135 PL_reg_flags |= RF_tainted;
1aa99e6b 4136 if (do_utf8) {
ffc61ed2 4137 loceol = PL_regeol;
1aa99e6b
IH
4138 while (hardcount < max && scan < loceol &&
4139 !isALNUM_LC_utf8((U8*)scan)) {
ffc61ed2
JH
4140 scan += UTF8SKIP(scan);
4141 hardcount++;
4142 }
4143 } else {
4144 while (scan < loceol && !isALNUM_LC(*scan))
4145 scan++;
a0ed51b3
LW
4146 }
4147 break;
a0d0e21e 4148 case SPACE:
1aa99e6b 4149 if (do_utf8) {
ffc61ed2 4150 loceol = PL_regeol;
8269fa76 4151 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4152 while (hardcount < max && scan < loceol &&
3568d838
JH
4153 (*scan == ' ' ||
4154 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4155 scan += UTF8SKIP(scan);
4156 hardcount++;
4157 }
4158 } else {
4159 while (scan < loceol && isSPACE(*scan))
4160 scan++;
a0ed51b3
LW
4161 }
4162 break;
bbce6d69 4163 case SPACEL:
3280af22 4164 PL_reg_flags |= RF_tainted;
1aa99e6b 4165 if (do_utf8) {
ffc61ed2 4166 loceol = PL_regeol;
1aa99e6b 4167 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4168 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4169 scan += UTF8SKIP(scan);
4170 hardcount++;
4171 }
4172 } else {
4173 while (scan < loceol && isSPACE_LC(*scan))
4174 scan++;
a0ed51b3
LW
4175 }
4176 break;
a0d0e21e 4177 case NSPACE:
1aa99e6b 4178 if (do_utf8) {
ffc61ed2 4179 loceol = PL_regeol;
8269fa76 4180 LOAD_UTF8_CHARCLASS(space," ");
1aa99e6b 4181 while (hardcount < max && scan < loceol &&
3568d838
JH
4182 !(*scan == ' ' ||
4183 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
ffc61ed2
JH
4184 scan += UTF8SKIP(scan);
4185 hardcount++;
4186 }
4187 } else {
4188 while (scan < loceol && !isSPACE(*scan))
4189 scan++;
4190 break;
a0ed51b3 4191 }
bbce6d69 4192 case NSPACEL:
3280af22 4193 PL_reg_flags |= RF_tainted;
1aa99e6b 4194 if (do_utf8) {
ffc61ed2 4195 loceol = PL_regeol;
1aa99e6b 4196 while (hardcount < max && scan < loceol &&
ffc61ed2
JH
4197 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4198 scan += UTF8SKIP(scan);
4199 hardcount++;
4200 }
4201 } else {
4202 while (scan < loceol && !isSPACE_LC(*scan))
4203 scan++;
a0ed51b3
LW
4204 }
4205 break;
a0d0e21e 4206 case DIGIT:
1aa99e6b 4207 if (do_utf8) {
ffc61ed2 4208 loceol = PL_regeol;
8269fa76 4209 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4210 while (hardcount < max && scan < loceol &&
3568d838 4211 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4212 scan += UTF8SKIP(scan);
4213 hardcount++;
4214 }
4215 } else {
4216 while (scan < loceol && isDIGIT(*scan))
4217 scan++;
a0ed51b3
LW
4218 }
4219 break;
a0d0e21e 4220 case NDIGIT:
1aa99e6b 4221 if (do_utf8) {
ffc61ed2 4222 loceol = PL_regeol;
8269fa76 4223 LOAD_UTF8_CHARCLASS(digit,"0");
1aa99e6b 4224 while (hardcount < max && scan < loceol &&
3568d838 4225 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
ffc61ed2
JH
4226 scan += UTF8SKIP(scan);
4227 hardcount++;
4228 }
4229 } else {
4230 while (scan < loceol && !isDIGIT(*scan))
4231 scan++;
a0ed51b3
LW
4232 }
4233 break;
a0d0e21e
LW
4234 default: /* Called on something of 0 width. */
4235 break; /* So match right here or not at all. */
4236 }
a687059c 4237
a0ed51b3
LW
4238 if (hardcount)
4239 c = hardcount;
4240 else
4241 c = scan - PL_reginput;
3280af22 4242 PL_reginput = scan;
a687059c 4243
9041c2e3 4244 DEBUG_r(
c277df42
IZ
4245 {
4246 SV *prop = sv_newmortal();
4247
4248 regprop(prop, p);
9041c2e3
NIS
4249 PerlIO_printf(Perl_debug_log,
4250 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7b0972df 4251 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
c277df42 4252 });
9041c2e3 4253
a0d0e21e 4254 return(c);
a687059c
LW
4255}
4256
4257/*
c277df42 4258 - regrepeat_hard - repeatedly match something, report total lenth and length
9041c2e3 4259 *
c277df42
IZ
4260 * The repeater is supposed to have constant length.
4261 */
4262
76e3520e 4263STATIC I32
cea2e8a9 4264S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
c277df42 4265{
b7953727 4266 register char *scan = Nullch;
c277df42 4267 register char *start;
3280af22 4268 register char *loceol = PL_regeol;
a0ed51b3 4269 I32 l = 0;
708e3b05 4270 I32 count = 0, res = 1;
a0ed51b3
LW
4271
4272 if (!max)
4273 return 0;
c277df42 4274
3280af22 4275 start = PL_reginput;
53c4c00c 4276 if (PL_reg_match_utf8) {
708e3b05 4277 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4278 if (!count++) {
4279 l = 0;
4280 while (start < PL_reginput) {
4281 l++;
4282 start += UTF8SKIP(start);
4283 }
4284 *lp = l;
4285 if (l == 0)
4286 return max;
4287 }
4288 if (count == max)
4289 return count;
4290 }
4291 }
4292 else {
708e3b05 4293 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
a0ed51b3
LW
4294 if (!count++) {
4295 *lp = l = PL_reginput - start;
4296 if (max != REG_INFTY && l*max < loceol - scan)
4297 loceol = scan + l*max;
4298 if (l == 0)
4299 return max;
c277df42
IZ
4300 }
4301 }
4302 }
708e3b05 4303 if (!res)
3280af22 4304 PL_reginput = scan;
9041c2e3 4305
a0ed51b3 4306 return count;
c277df42
IZ
4307}
4308
4309/*
ffc61ed2
JH
4310- regclass_swash - prepare the utf8 swash
4311*/
4312
4313SV *
9e55ce06 4314Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 4315{
9e55ce06
JH
4316 SV *sw = NULL;
4317 SV *si = NULL;
4318 SV *alt = NULL;
ffc61ed2
JH
4319
4320 if (PL_regdata && PL_regdata->count) {
4321 U32 n = ARG(node);
4322
4323 if (PL_regdata->what[n] == 's') {
4324 SV *rv = (SV*)PL_regdata->data[n];
4325 AV *av = (AV*)SvRV((SV*)rv);
b11f357e 4326 SV **ary = AvARRAY(av);
9e55ce06 4327 SV **a, **b;
9041c2e3 4328
9e55ce06
JH
4329 /* See the end of regcomp.c:S_reglass() for
4330 * documentation of these array elements. */
4331
b11f357e
JH
4332 si = *ary;
4333 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4334 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4335
ffc61ed2
JH
4336 if (a)
4337 sw = *a;
4338 else if (si && doinit) {
4339 sw = swash_init("utf8", "", si, 1, 0);
4340 (void)av_store(av, 1, sw);
4341 }
9e55ce06
JH
4342 if (b)
4343 alt = *b;
ffc61ed2
JH
4344 }
4345 }
4346
9e55ce06
JH
4347 if (listsvp)
4348 *listsvp = si;
4349 if (altsvp)
4350 *altsvp = alt;
ffc61ed2
JH
4351
4352 return sw;
4353}
4354
4355/*
ba7b4546 4356 - reginclass - determine if a character falls into a character class
832705d4
JH
4357
4358 The n is the ANYOF regnode, the p is the target string, lenp
4359 is pointer to the maximum length of how far to go in the p
4360 (if the lenp is zero, UTF8SKIP(p) is used),
4361 do_utf8 tells whether the target string is in UTF-8.
4362
bbce6d69 4363 */
4364
76e3520e 4365STATIC bool
ba7b4546 4366S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
bbce6d69 4367{
ffc61ed2 4368 char flags = ANYOF_FLAGS(n);
bbce6d69 4369 bool match = FALSE;
cc07378b 4370 UV c = *p;
ae9ddab8 4371 STRLEN len = 0;
9e55ce06 4372 STRLEN plen;
1aa99e6b 4373
ae9ddab8
JH
4374 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4375 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
4376 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
bbce6d69 4377
0f0076b4 4378 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
ffc61ed2 4379 if (do_utf8 || (flags & ANYOF_UNICODE)) {
9e55ce06
JH
4380 if (lenp)
4381 *lenp = 0;
ffc61ed2 4382 if (do_utf8 && !ANYOF_RUNTIME(n)) {
ffc61ed2
JH
4383 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4384 match = TRUE;
bbce6d69 4385 }
3568d838 4386 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
1aa99e6b 4387 match = TRUE;
ffc61ed2 4388 if (!match) {
9e55ce06
JH
4389 AV *av;
4390 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
ffc61ed2
JH
4391
4392 if (sw) {
3568d838 4393 if (swash_fetch(sw, p, do_utf8))
ffc61ed2
JH
4394 match = TRUE;
4395 else if (flags & ANYOF_FOLD) {
9e55ce06
JH
4396 if (!match && lenp && av) {
4397 I32 i;
4398
4399 for (i = 0; i <= av_len(av); i++) {
4400 SV* sv = *av_fetch(av, i, FALSE);
4401 STRLEN len;
4402 char *s = SvPV(sv, len);
4403
061b10df 4404 if (len <= plen && memEQ(s, (char*)p, len)) {
9e55ce06
JH
4405 *lenp = len;
4406 match = TRUE;
4407 break;
4408 }
4409 }
4410 }
4411 if (!match) {
4a623e43
JH
4412 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
4413 STRLEN tmplen;
4414
9e55ce06
JH
4415 to_utf8_fold(p, tmpbuf, &tmplen);
4416 if (swash_fetch(sw, tmpbuf, do_utf8))
4417 match = TRUE;
4418 }
ffc61ed2
JH
4419 }
4420 }
bbce6d69 4421 }
9e55ce06 4422 if (match && lenp && *lenp == 0)
0f0076b4 4423 *lenp = UNISKIP(NATIVE_TO_UNI(c));
bbce6d69 4424 }
1aa99e6b 4425 if (!match && c < 256) {
ffc61ed2
JH
4426 if (ANYOF_BITMAP_TEST(n, c))
4427 match = TRUE;
4428 else if (flags & ANYOF_FOLD) {
eb160463 4429 U8 f;
a0ed51b3 4430
ffc61ed2
JH
4431 if (flags & ANYOF_LOCALE) {
4432 PL_reg_flags |= RF_tainted;
4433 f = PL_fold_locale[c];
4434 }
4435 else
4436 f = PL_fold[c];
4437 if (f != c && ANYOF_BITMAP_TEST(n, f))
4438 match = TRUE;
4439 }
4440
4441 if (!match && (flags & ANYOF_CLASS)) {
a0ed51b3 4442 PL_reg_flags |= RF_tainted;
ffc61ed2
JH
4443 if (
4444 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4445 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4446 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4447 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4448 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4449 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4450 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4451 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4452 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4453 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4454 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4455 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4456 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4457 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4458 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4459 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4460 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4461 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4462 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4463 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4472 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4473 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4474 ) /* How's that for a conditional? */
4475 {
4476 match = TRUE;
4477 }
a0ed51b3 4478 }
a0ed51b3
LW
4479 }
4480
a0ed51b3
LW
4481 return (flags & ANYOF_INVERT) ? !match : match;
4482}
161b471a 4483
dfe13c55 4484STATIC U8 *
cea2e8a9 4485S_reghop(pTHX_ U8 *s, I32 off)
9041c2e3 4486{
1aa99e6b
IH
4487 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4488}
4489
4490STATIC U8 *
4491S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
9041c2e3 4492{
a0ed51b3 4493 if (off >= 0) {
1aa99e6b 4494 while (off-- && s < lim) {
ffc61ed2 4495 /* XXX could check well-formedness here */
a0ed51b3 4496 s += UTF8SKIP(s);
ffc61ed2 4497 }
a0ed51b3
LW
4498 }
4499 else {
4500 while (off++) {
1aa99e6b 4501 if (s > lim) {
a0ed51b3 4502 s--;
ffc61ed2 4503 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4504 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4505 s--;
ffc61ed2
JH
4506 }
4507 /* XXX could check well-formedness here */
a0ed51b3
LW
4508 }
4509 }
4510 }
4511 return s;
4512}
161b471a 4513
dfe13c55 4514STATIC U8 *
1aa99e6b 4515S_reghopmaybe(pTHX_ U8 *s, I32 off)
9041c2e3 4516{
1aa99e6b
IH
4517 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4518}
4519
4520STATIC U8 *
4521S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
a0ed51b3
LW
4522{
4523 if (off >= 0) {
1aa99e6b 4524 while (off-- && s < lim) {
ffc61ed2 4525 /* XXX could check well-formedness here */
a0ed51b3 4526 s += UTF8SKIP(s);
ffc61ed2 4527 }
a0ed51b3
LW
4528 if (off >= 0)
4529 return 0;
4530 }
4531 else {
4532 while (off++) {
1aa99e6b 4533 if (s > lim) {
a0ed51b3 4534 s--;
ffc61ed2 4535 if (UTF8_IS_CONTINUED(*s)) {
1aa99e6b 4536 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
a0ed51b3 4537 s--;
ffc61ed2
JH
4538 }
4539 /* XXX could check well-formedness here */
a0ed51b3
LW
4540 }
4541 else
4542 break;
4543 }
4544 if (off <= 0)
4545 return 0;
4546 }
4547 return s;
4548}
51371543 4549
51371543 4550static void
acfe0abc 4551restore_pos(pTHX_ void *arg)
51371543 4552{
51371543
GS
4553 if (PL_reg_eval_set) {
4554 if (PL_reg_oldsaved) {
4555 PL_reg_re->subbeg = PL_reg_oldsaved;
4556 PL_reg_re->sublen = PL_reg_oldsavedlen;
4557 RX_MATCH_COPIED_on(PL_reg_re);
4558 }
4559 PL_reg_magic->mg_len = PL_reg_oldpos;
4560 PL_reg_eval_set = 0;
4561 PL_curpm = PL_reg_oldcurpm;
4562 }
4563}
33b8afdf
JH
4564
4565STATIC void
4566S_to_utf8_substr(pTHX_ register regexp *prog)
4567{
4568 SV* sv;
4569 if (prog->float_substr && !prog->float_utf8) {
4570 prog->float_utf8 = sv = NEWSV(117, 0);
d3848741 4571 SvSetSV(sv, prog->float_substr);
33b8afdf
JH
4572 sv_utf8_upgrade(sv);
4573 if (SvTAIL(prog->float_substr))
4574 SvTAIL_on(sv);
4575 if (prog->float_substr == prog->check_substr)
4576 prog->check_utf8 = sv;
4577 }
4578 if (prog->anchored_substr && !prog->anchored_utf8) {
4579 prog->anchored_utf8 = sv = NEWSV(118, 0);
d3848741 4580 SvSetSV(sv, prog->anchored_substr);
33b8afdf
JH
4581 sv_utf8_upgrade(sv);
4582 if (SvTAIL(prog->anchored_substr))
4583 SvTAIL_on(sv);
4584 if (prog->anchored_substr == prog->check_substr)
4585 prog->check_utf8 = sv;
4586 }
4587}
4588
4589STATIC void
4590S_to_byte_substr(pTHX_ register regexp *prog)
4591{
4592 SV* sv;
4593 if (prog->float_utf8 && !prog->float_substr) {
4594 prog->float_substr = sv = NEWSV(117, 0);
d3848741 4595 SvSetSV(sv, prog->float_utf8);
33b8afdf
JH
4596 if (sv_utf8_downgrade(sv, TRUE)) {
4597 if (SvTAIL(prog->float_utf8))
4598 SvTAIL_on(sv);
4599 } else {
4600 SvREFCNT_dec(sv);
4601 prog->float_substr = sv = &PL_sv_undef;
4602 }
4603 if (prog->float_utf8 == prog->check_utf8)
4604 prog->check_substr = sv;
4605 }
4606 if (prog->anchored_utf8 && !prog->anchored_substr) {
4607 prog->anchored_substr = sv = NEWSV(118, 0);
d3848741 4608 SvSetSV(sv, prog->anchored_utf8);
33b8afdf
JH
4609 if (sv_utf8_downgrade(sv, TRUE)) {
4610 if (SvTAIL(prog->anchored_utf8))
4611 SvTAIL_on(sv);
4612 } else {
4613 SvREFCNT_dec(sv);
4614 prog->anchored_substr = sv = &PL_sv_undef;
4615 }
4616 if (prog->anchored_utf8 == prog->check_utf8)
4617 prog->check_substr = sv;
4618 }
4619}