This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Updated escaping code. utf8 regex debug output improvements
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
5 * "One Ring to rule them all, One Ring to find them..."
6 */
7
61296642
DM
8/* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
9041c2e3 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 **** Alterations to Henry's code are...
57 ****
4bb101f2 58 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 59 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 60 ****
9ef589d8
LW
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
a687059c
LW
63 *
64 * Beware that some of this code is subtly aware of the way operator
65 * precedence is structured in regular expressions. Serious changes in
66 * regular-expression syntax might require a total rethink.
67 */
68#include "EXTERN.h"
864dbfa3 69#define PERL_IN_REGEXEC_C
a687059c 70#include "perl.h"
0f5d15d6 71
54df2634
NC
72#ifdef PERL_IN_XSUB_RE
73# include "re_comp.h"
74#else
75# include "regcomp.h"
76#endif
a687059c 77
c277df42
IZ
78#define RF_tainted 1 /* tainted information used? */
79#define RF_warned 2 /* warned about big count? */
ce862d02 80#define RF_evaled 4 /* Did an EVAL with setting? */
ab3bbdeb 81#define RF_utf8 8 /* Pattern contains multibyte chars? */
a0ed51b3 82
eb160463 83#define UTF ((PL_reg_flags & RF_utf8) != 0)
ce862d02
IZ
84
85#define RS_init 1 /* eval environment created */
86#define RS_set 2 /* replsv value is set */
c277df42 87
a687059c
LW
88#ifndef STATIC
89#define STATIC static
90#endif
91
32fc9b6a 92#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 93
c277df42
IZ
94/*
95 * Forwards.
96 */
97
33b8afdf 98#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 99#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 100
3dab1dad
YO
101#define HOPc(pos,off) \
102 (char *)(PL_reg_match_utf8 \
52657f30 103 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
104 : (U8*)(pos + off))
105#define HOPBACKc(pos, off) \
07be1b83
YO
106 (char*)(PL_reg_match_utf8\
107 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
108 : (pos - off >= PL_bostr) \
8e11feef 109 ? (U8*)pos - off \
3dab1dad 110 : NULL)
efb30f32 111
53c4c00c 112#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
1aa99e6b 113#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 114
1a4fad37
AL
115#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
116 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
117#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
118#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
119#define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
120#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
51371543 121
3dab1dad
YO
122/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
123
5f80c4cf 124/* for use after a quantifier and before an EXACT-like node -- japhy */
e2d8ce26
JP
125#define JUMPABLE(rn) ( \
126 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
cca55fe3
JP
127 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
128 OP(rn) == PLUS || OP(rn) == MINMOD || \
3dab1dad 129 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26
JP
130)
131
cca55fe3 132#define HAS_TEXT(rn) ( \
3dab1dad 133 PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
cca55fe3 134)
e2d8ce26 135
a84d97b6
HS
136/*
137 Search for mandatory following text node; for lookahead, the text must
138 follow but for lookbehind (rn->flags != 0) we skip to the next step.
139*/
cca55fe3 140#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
141 while (JUMPABLE(rn)) { \
142 const OPCODE type = OP(rn); \
143 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 144 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 145 else if (type == PLUS) \
cca55fe3 146 rn = NEXTOPER(rn); \
3dab1dad 147 else if (type == IFMATCH) \
a84d97b6 148 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 149 else rn += NEXT_OFF(rn); \
3dab1dad 150 } \
5f80c4cf 151} STMT_END
74750237 152
acfe0abc 153static void restore_pos(pTHX_ void *arg);
51371543 154
76e3520e 155STATIC CHECKPOINT
cea2e8a9 156S_regcppush(pTHX_ I32 parenfloor)
a0d0e21e 157{
97aff369 158 dVAR;
a3b680e6 159 const int retval = PL_savestack_ix;
b1ce53c5 160#define REGCP_PAREN_ELEMS 4
a3b680e6 161 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
a0d0e21e 162 int p;
40a82448 163 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 164
e49a9654
IH
165 if (paren_elems_to_push < 0)
166 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
167
a01268b5 168#define REGCP_OTHER_ELEMS 6
4b3c1a47 169 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
3280af22 170 for (p = PL_regsize; p > parenfloor; p--) {
b1ce53c5 171/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
cf93c79d
IZ
172 SSPUSHINT(PL_regendp[p]);
173 SSPUSHINT(PL_regstartp[p]);
3280af22 174 SSPUSHPTR(PL_reg_start_tmp[p]);
a0d0e21e 175 SSPUSHINT(p);
40a82448
DM
176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
177 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
178 (UV)p, (IV)PL_regstartp[p],
179 (IV)(PL_reg_start_tmp[p] - PL_bostr),
180 (IV)PL_regendp[p]
181 ));
a0d0e21e 182 }
b1ce53c5 183/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
3280af22
NIS
184 SSPUSHINT(PL_regsize);
185 SSPUSHINT(*PL_reglastparen);
a01268b5 186 SSPUSHINT(*PL_reglastcloseparen);
3280af22 187 SSPUSHPTR(PL_reginput);
41123dfd
JH
188#define REGCP_FRAME_ELEMS 2
189/* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
190 * are needed for the regexp context stack bookkeeping. */
191 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
b1ce53c5 192 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
41123dfd 193
a0d0e21e
LW
194 return retval;
195}
196
c277df42 197/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
198#define REGCP_SET(cp) \
199 DEBUG_STATE_r( \
200 if (cp != PL_savestack_ix) \
201 PerlIO_printf(Perl_debug_log, \
faccc32b 202 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
203 (IV)PL_savestack_ix)); \
204 cp = PL_savestack_ix
c3464db5 205
ab3bbdeb
YO
206#define REGCP_UNWIND(cp) \
207 DEBUG_EXECUTE_r( \
208 if (cp != PL_savestack_ix) \
c3464db5 209 PerlIO_printf(Perl_debug_log, \
faccc32b 210 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
211 (IV)(cp), (IV)PL_savestack_ix)); \
212 regcpblow(cp)
c277df42 213
76e3520e 214STATIC char *
097eb12c 215S_regcppop(pTHX_ const regexp *rex)
a0d0e21e 216{
97aff369 217 dVAR;
b1ce53c5 218 I32 i;
a0d0e21e 219 char *input;
b1ce53c5 220
a3621e74
YO
221 GET_RE_DEBUG_FLAGS_DECL;
222
b1ce53c5 223 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
a0d0e21e 224 i = SSPOPINT;
b1ce53c5
JH
225 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
226 i = SSPOPINT; /* Parentheses elements to pop. */
a0d0e21e 227 input = (char *) SSPOPPTR;
a01268b5 228 *PL_reglastcloseparen = SSPOPINT;
3280af22
NIS
229 *PL_reglastparen = SSPOPINT;
230 PL_regsize = SSPOPINT;
b1ce53c5
JH
231
232 /* Now restore the parentheses context. */
41123dfd
JH
233 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
234 i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 235 I32 tmps;
097eb12c 236 U32 paren = (U32)SSPOPINT;
3280af22 237 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
cf93c79d
IZ
238 PL_regstartp[paren] = SSPOPINT;
239 tmps = SSPOPINT;
3280af22
NIS
240 if (paren <= *PL_reglastparen)
241 PL_regendp[paren] = tmps;
a3621e74 242 DEBUG_EXECUTE_r(
c3464db5 243 PerlIO_printf(Perl_debug_log,
b900a521 244 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
9041c2e3 245 (UV)paren, (IV)PL_regstartp[paren],
b900a521 246 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
9041c2e3 247 (IV)PL_regendp[paren],
3280af22 248 (paren > *PL_reglastparen ? "(no)" : ""));
c277df42 249 );
a0d0e21e 250 }
a3621e74 251 DEBUG_EXECUTE_r(
bb7a0f54 252 if (*PL_reglastparen + 1 <= rex->nparens) {
c3464db5 253 PerlIO_printf(Perl_debug_log,
faccc32b 254 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
4f639d21 255 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
c277df42
IZ
256 }
257 );
daf18116 258#if 1
dafc8851
JH
259 /* It would seem that the similar code in regtry()
260 * already takes care of this, and in fact it is in
261 * a better location to since this code can #if 0-ed out
262 * but the code in regtry() is needed or otherwise tests
263 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
264 * (as of patchlevel 7877) will fail. Then again,
265 * this code seems to be necessary or otherwise
266 * building DynaLoader will fail:
267 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
268 * --jhi */
bb7a0f54 269 for (i = *PL_reglastparen + 1; (U32)i <= rex->nparens; i++) {
097eb12c
AL
270 if (i > PL_regsize)
271 PL_regstartp[i] = -1;
272 PL_regendp[i] = -1;
a0d0e21e 273 }
dafc8851 274#endif
a0d0e21e
LW
275 return input;
276}
277
02db2b7b 278#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 279
a687059c 280/*
e50aee73 281 * pregexec and friends
a687059c
LW
282 */
283
76234dfb 284#ifndef PERL_IN_XSUB_RE
a687059c 285/*
c277df42 286 - pregexec - match a regexp against a string
a687059c 287 */
c277df42 288I32
864dbfa3 289Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
c3464db5 290 char *strbeg, I32 minend, SV *screamer, U32 nosave)
c277df42
IZ
291/* strend: pointer to null at end of string */
292/* strbeg: real beginning of string */
293/* minend: end of match must be >=minend after stringarg. */
294/* nosave: For optimizations. */
295{
296 return
9041c2e3 297 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
298 nosave ? 0 : REXEC_COPY_STR);
299}
76234dfb 300#endif
22e551b9 301
9041c2e3 302/*
cad2e5aa
JH
303 * Need to implement the following flags for reg_anch:
304 *
305 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
306 * USE_INTUIT_ML
307 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
308 * INTUIT_AUTORITATIVE_ML
309 * INTUIT_ONCE_NOML - Intuit can match in one location only.
310 * INTUIT_ONCE_ML
311 *
312 * Another flag for this function: SECOND_TIME (so that float substrs
313 * with giant delta may be not rechecked).
314 */
315
316/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
317
3f7c398e 318/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
319 Otherwise, only SvCUR(sv) is used to get strbeg. */
320
321/* XXXX We assume that strpos is strbeg unless sv. */
322
6eb5f6b9
JH
323/* XXXX Some places assume that there is a fixed substring.
324 An update may be needed if optimizer marks as "INTUITable"
325 RExen without fixed substrings. Similarly, it is assumed that
326 lengths of all the strings are no more than minlen, thus they
327 cannot come from lookahead.
328 (Or minlen should take into account lookahead.) */
329
2c2d71f5
JH
330/* A failure to find a constant substring means that there is no need to make
331 an expensive call to REx engine, thus we celebrate a failure. Similarly,
332 finding a substring too deep into the string means that less calls to
30944b6d
IZ
333 regtry() should be needed.
334
335 REx compiler's optimizer found 4 possible hints:
336 a) Anchored substring;
337 b) Fixed substring;
338 c) Whether we are anchored (beginning-of-line or \G);
339 d) First node (of those at offset 0) which may distingush positions;
6eb5f6b9 340 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
341 string which does not contradict any of them.
342 */
2c2d71f5 343
6eb5f6b9
JH
344/* Most of decisions we do here should have been done at compile time.
345 The nodes of the REx which we used for the search should have been
346 deleted from the finite automaton. */
347
cad2e5aa
JH
348char *
349Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
350 char *strend, U32 flags, re_scream_pos_data *data)
351{
97aff369 352 dVAR;
b7953727 353 register I32 start_shift = 0;
cad2e5aa 354 /* Should be nonnegative! */
b7953727 355 register I32 end_shift = 0;
2c2d71f5
JH
356 register char *s;
357 register SV *check;
a1933d95 358 char *strbeg;
cad2e5aa 359 char *t;
a3b680e6 360 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
cad2e5aa 361 I32 ml_anch;
bd61b366
SS
362 register char *other_last = NULL; /* other substr checked before this */
363 char *check_at = NULL; /* check substr found at this pos */
1df70142 364 const I32 multiline = prog->reganch & PMf_MULTILINE;
30944b6d 365#ifdef DEBUGGING
890ce7af 366 const char * const i_strpos = strpos;
30944b6d 367#endif
a3621e74
YO
368
369 GET_RE_DEBUG_FLAGS_DECL;
370
a30b2f1f 371 RX_MATCH_UTF8_set(prog,do_utf8);
cad2e5aa 372
b8d68ded 373 if (prog->reganch & ROPT_UTF8) {
b8d68ded
JH
374 PL_reg_flags |= RF_utf8;
375 }
ab3bbdeb
YO
376 DEBUG_EXECUTE_r(
377 debug_start_match(prog, do_utf8, strpos, strend,
378 "Guessing start of match for");
2a782b5b 379 );
cad2e5aa 380
c344f387
JH
381 /* CHR_DIST() would be more correct here but it makes things slow. */
382 if (prog->minlen > strend - strpos) {
a3621e74 383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 384 "String too short... [re_intuit_start]\n"));
cad2e5aa 385 goto fail;
2c2d71f5 386 }
a1933d95 387 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
1aa99e6b 388 PL_regeol = strend;
33b8afdf
JH
389 if (do_utf8) {
390 if (!prog->check_utf8 && prog->check_substr)
391 to_utf8_substr(prog);
392 check = prog->check_utf8;
393 } else {
394 if (!prog->check_substr && prog->check_utf8)
395 to_byte_substr(prog);
396 check = prog->check_substr;
397 }
398 if (check == &PL_sv_undef) {
a3621e74 399 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
400 "Non-utf string cannot match utf check string\n"));
401 goto fail;
402 }
2c2d71f5 403 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
cad2e5aa
JH
404 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
405 || ( (prog->reganch & ROPT_ANCH_BOL)
7fba1cd6 406 && !multiline ) ); /* Check after \n? */
cad2e5aa 407
7e25d62c
JH
408 if (!ml_anch) {
409 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
410 | ROPT_IMPLICIT)) /* not a real BOL */
3f7c398e 411 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
412 && sv && !SvROK(sv)
413 && (strpos != strbeg)) {
a3621e74 414 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
415 goto fail;
416 }
417 if (prog->check_offset_min == prog->check_offset_max &&
cce850e4 418 !(prog->reganch & ROPT_CANY_SEEN)) {
2c2d71f5 419 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
420 I32 slen;
421
1aa99e6b 422 s = HOP3c(strpos, prog->check_offset_min, strend);
653099ff
GS
423 if (SvTAIL(check)) {
424 slen = SvCUR(check); /* >= 1 */
cad2e5aa 425
9041c2e3 426 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 427 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 428 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 429 goto fail_finish;
cad2e5aa
JH
430 }
431 /* Now should match s[0..slen-2] */
432 slen--;
3f7c398e 433 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 434 || (slen > 1
3f7c398e 435 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 436 report_neq:
a3621e74 437 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
438 goto fail_finish;
439 }
cad2e5aa 440 }
3f7c398e 441 else if (*SvPVX_const(check) != *s
653099ff 442 || ((slen = SvCUR(check)) > 1
3f7c398e 443 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 444 goto report_neq;
c315bfe8 445 check_at = s;
2c2d71f5 446 goto success_at_start;
7e25d62c 447 }
cad2e5aa 448 }
2c2d71f5 449 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 450 s = strpos;
2c2d71f5 451 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
2c2d71f5 452 end_shift = prog->minlen - start_shift -
653099ff 453 CHR_SVLEN(check) + (SvTAIL(check) != 0);
2c2d71f5 454 if (!ml_anch) {
a3b680e6 455 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 456 - (SvTAIL(check) != 0);
a3b680e6 457 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
458
459 if (end_shift < eshift)
460 end_shift = eshift;
461 }
cad2e5aa 462 }
2c2d71f5 463 else { /* Can match at random position */
cad2e5aa
JH
464 ml_anch = 0;
465 s = strpos;
2c2d71f5
JH
466 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
467 /* Should be nonnegative! */
468 end_shift = prog->minlen - start_shift -
653099ff 469 CHR_SVLEN(check) + (SvTAIL(check) != 0);
cad2e5aa
JH
470 }
471
2c2d71f5 472#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 473 if (end_shift < 0)
6bbae5e6 474 Perl_croak(aTHX_ "panic: end_shift");
2c2d71f5
JH
475#endif
476
2c2d71f5
JH
477 restart:
478 /* Find a possible match in the region s..strend by looking for
479 the "check" substring in the region corrected by start/end_shift. */
cad2e5aa 480 if (flags & REXEC_SCREAM) {
cad2e5aa 481 I32 p = -1; /* Internal iterator of scream. */
a3b680e6 482 I32 * const pp = data ? data->scream_pos : &p;
cad2e5aa 483
2c2d71f5
JH
484 if (PL_screamfirst[BmRARE(check)] >= 0
485 || ( BmRARE(check) == '\n'
486 && (BmPREVIOUS(check) == SvCUR(check) - 1)
487 && SvTAIL(check) ))
9041c2e3 488 s = screaminstr(sv, check,
2c2d71f5 489 start_shift + (s - strbeg), end_shift, pp, 0);
cad2e5aa 490 else
2c2d71f5 491 goto fail_finish;
4addbd3b
HS
492 /* we may be pointing at the wrong string */
493 if (s && RX_MATCH_COPIED(prog))
3f7c398e 494 s = strbeg + (s - SvPVX_const(sv));
cad2e5aa
JH
495 if (data)
496 *data->scream_olds = s;
497 }
f33976b4 498 else if (prog->reganch & ROPT_CANY_SEEN)
3baa4c62
JH
499 s = fbm_instr((U8*)(s + start_shift),
500 (U8*)(strend - end_shift),
7fba1cd6 501 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa 502 else
1aa99e6b
IH
503 s = fbm_instr(HOP3(s, start_shift, strend),
504 HOP3(strend, -end_shift, strbeg),
7fba1cd6 505 check, multiline ? FBMrf_MULTILINE : 0);
cad2e5aa
JH
506
507 /* Update the count-of-usability, remove useless subpatterns,
508 unshift s. */
2c2d71f5 509
ab3bbdeb
YO
510 DEBUG_EXECUTE_r({
511 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
512 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
513 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 514 (s ? "Found" : "Did not find"),
ab3bbdeb
YO
515 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
516 ? "anchored" : "floating"),
517 quoted,
518 RE_SV_TAIL(check),
519 (s ? " at offset " : "...\n") );
520 });
2c2d71f5
JH
521
522 if (!s)
523 goto fail_finish;
524
6eb5f6b9
JH
525 check_at = s;
526
2c2d71f5 527 /* Finish the diagnostic message */
a3621e74 528 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5
JH
529
530 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
531 Start with the other substr.
532 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 533 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
534 *always* match. Probably should be marked during compile...
535 Probably it is right to do no SCREAM here...
536 */
537
33b8afdf 538 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
30944b6d 539 /* Take into account the "other" substring. */
2c2d71f5
JH
540 /* XXXX May be hopelessly wrong for UTF... */
541 if (!other_last)
6eb5f6b9 542 other_last = strpos;
33b8afdf 543 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
544 do_other_anchored:
545 {
890ce7af
AL
546 char * const last = HOP3c(s, -start_shift, strbeg);
547 char *last1, *last2;
be8e71aa 548 char * const saved_s = s;
33b8afdf 549 SV* must;
2c2d71f5 550
2c2d71f5
JH
551 t = s - prog->check_offset_max;
552 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 553 && (!do_utf8
0ce71af7 554 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 555 && t > strpos)))
6f207bd3 556 NOOP;
2c2d71f5
JH
557 else
558 t = strpos;
1aa99e6b 559 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
560 if (t < other_last) /* These positions already checked */
561 t = other_last;
1aa99e6b 562 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
563 if (last < last1)
564 last1 = last;
565 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
566 /* On end-of-str: see comment below. */
33b8afdf
JH
567 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
568 if (must == &PL_sv_undef) {
569 s = (char*)NULL;
a3621e74 570 DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
571 }
572 else
573 s = fbm_instr(
574 (unsigned char*)t,
575 HOP3(HOP3(last1, prog->anchored_offset, strend)
576 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
577 must,
7fba1cd6 578 multiline ? FBMrf_MULTILINE : 0
33b8afdf 579 );
ab3bbdeb
YO
580 DEBUG_EXECUTE_r({
581 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
582 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
583 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 584 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
585 quoted, RE_SV_TAIL(must));
586 });
587
588
2c2d71f5
JH
589 if (!s) {
590 if (last1 >= last2) {
a3621e74 591 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
592 ", giving up...\n"));
593 goto fail_finish;
594 }
a3621e74 595 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 596 ", trying floating at offset %ld...\n",
be8e71aa 597 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
598 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
599 s = HOP3c(last, 1, strend);
2c2d71f5
JH
600 goto restart;
601 }
602 else {
a3621e74 603 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 604 (long)(s - i_strpos)));
1aa99e6b
IH
605 t = HOP3c(s, -prog->anchored_offset, strbeg);
606 other_last = HOP3c(s, 1, strend);
be8e71aa 607 s = saved_s;
2c2d71f5
JH
608 if (t == strpos)
609 goto try_at_start;
2c2d71f5
JH
610 goto try_at_offset;
611 }
30944b6d 612 }
2c2d71f5
JH
613 }
614 else { /* Take into account the floating substring. */
33b8afdf 615 char *last, *last1;
be8e71aa 616 char * const saved_s = s;
33b8afdf
JH
617 SV* must;
618
619 t = HOP3c(s, -start_shift, strbeg);
620 last1 = last =
621 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
622 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
623 last = HOP3c(t, prog->float_max_offset, strend);
624 s = HOP3c(t, prog->float_min_offset, strend);
625 if (s < other_last)
626 s = other_last;
2c2d71f5 627 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
33b8afdf
JH
628 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
629 /* fbm_instr() takes into account exact value of end-of-str
630 if the check is SvTAIL(ed). Since false positives are OK,
631 and end-of-str is not later than strend we are OK. */
632 if (must == &PL_sv_undef) {
633 s = (char*)NULL;
a3621e74 634 DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
635 }
636 else
2c2d71f5 637 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
638 (unsigned char*)last + SvCUR(must)
639 - (SvTAIL(must)!=0),
7fba1cd6 640 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb
YO
641 DEBUG_EXECUTE_r({
642 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
643 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
644 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 645 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
646 quoted, RE_SV_TAIL(must));
647 });
33b8afdf
JH
648 if (!s) {
649 if (last1 == last) {
a3621e74 650 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
651 ", giving up...\n"));
652 goto fail_finish;
2c2d71f5 653 }
a3621e74 654 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 655 ", trying anchored starting at offset %ld...\n",
be8e71aa 656 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
657 other_last = last;
658 s = HOP3c(t, 1, strend);
659 goto restart;
660 }
661 else {
a3621e74 662 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
663 (long)(s - i_strpos)));
664 other_last = s; /* Fix this later. --Hugo */
be8e71aa 665 s = saved_s;
33b8afdf
JH
666 if (t == strpos)
667 goto try_at_start;
668 goto try_at_offset;
669 }
2c2d71f5 670 }
cad2e5aa 671 }
2c2d71f5
JH
672
673 t = s - prog->check_offset_max;
2c2d71f5 674 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1d86a7f9 675 && (!do_utf8
0ce71af7 676 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
1aa99e6b 677 && t > strpos))) {
2c2d71f5
JH
678 /* Fixed substring is found far enough so that the match
679 cannot start at strpos. */
680 try_at_offset:
cad2e5aa 681 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
682 /* Eventually fbm_*() should handle this, but often
683 anchored_offset is not 0, so this check will not be wasted. */
684 /* XXXX In the code below we prefer to look for "^" even in
685 presence of anchored substrings. And we search even
686 beyond the found float position. These pessimizations
687 are historical artefacts only. */
688 find_anchor:
2c2d71f5 689 while (t < strend - prog->minlen) {
cad2e5aa 690 if (*t == '\n') {
4ee3650e 691 if (t < check_at - prog->check_offset_min) {
33b8afdf 692 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
693 /* Since we moved from the found position,
694 we definitely contradict the found anchored
30944b6d
IZ
695 substr. Due to the above check we do not
696 contradict "check" substr.
697 Thus we can arrive here only if check substr
698 is float. Redo checking for "other"=="fixed".
699 */
9041c2e3 700 strpos = t + 1;
a3621e74 701 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 702 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
703 goto do_other_anchored;
704 }
4ee3650e
GS
705 /* We don't contradict the found floating substring. */
706 /* XXXX Why not check for STCLASS? */
cad2e5aa 707 s = t + 1;
a3621e74 708 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 709 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
710 goto set_useful;
711 }
4ee3650e
GS
712 /* Position contradicts check-string */
713 /* XXXX probably better to look for check-string
714 than for "\n", so one should lower the limit for t? */
a3621e74 715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 716 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 717 other_last = strpos = s = t + 1;
cad2e5aa
JH
718 goto restart;
719 }
720 t++;
721 }
a3621e74 722 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 723 PL_colors[0], PL_colors[1]));
2c2d71f5 724 goto fail_finish;
cad2e5aa 725 }
f5952150 726 else {
a3621e74 727 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 728 PL_colors[0], PL_colors[1]));
f5952150 729 }
cad2e5aa
JH
730 s = t;
731 set_useful:
33b8afdf 732 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
733 }
734 else {
f5952150 735 /* The found string does not prohibit matching at strpos,
2c2d71f5 736 - no optimization of calling REx engine can be performed,
f5952150
GS
737 unless it was an MBOL and we are not after MBOL,
738 or a future STCLASS check will fail this. */
2c2d71f5
JH
739 try_at_start:
740 /* Even in this situation we may use MBOL flag if strpos is offset
741 wrt the start of the string. */
05b4157f 742 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 743 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d
IZ
744 /* May be due to an implicit anchor of m{.*foo} */
745 && !(prog->reganch & ROPT_IMPLICIT))
746 {
cad2e5aa
JH
747 t = strpos;
748 goto find_anchor;
749 }
a3621e74 750 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 751 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
e4584336 752 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 753 );
2c2d71f5 754 success_at_start:
30944b6d 755 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
33b8afdf
JH
756 && (do_utf8 ? (
757 prog->check_utf8 /* Could be deleted already */
758 && --BmUSEFUL(prog->check_utf8) < 0
759 && (prog->check_utf8 == prog->float_utf8)
760 ) : (
761 prog->check_substr /* Could be deleted already */
762 && --BmUSEFUL(prog->check_substr) < 0
763 && (prog->check_substr == prog->float_substr)
764 )))
66e933ab 765 {
cad2e5aa 766 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 767 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
33b8afdf
JH
768 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
769 if (do_utf8 ? prog->check_substr : prog->check_utf8)
770 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
771 prog->check_substr = prog->check_utf8 = NULL; /* disable */
772 prog->float_substr = prog->float_utf8 = NULL; /* clear */
773 check = NULL; /* abort */
cad2e5aa 774 s = strpos;
3cf5c195
IZ
775 /* XXXX This is a remnant of the old implementation. It
776 looks wasteful, since now INTUIT can use many
6eb5f6b9 777 other heuristics. */
cad2e5aa
JH
778 prog->reganch &= ~RE_USE_INTUIT;
779 }
780 else
781 s = strpos;
782 }
783
6eb5f6b9
JH
784 /* Last resort... */
785 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
07be1b83 786 if (prog->regstclass && OP(prog->regstclass)!=TRIE) {
6eb5f6b9
JH
787 /* minlen == 0 is possible if regstclass is \b or \B,
788 and the fixed substr is ''$.
789 Since minlen is already taken into account, s+1 is before strend;
790 accidentally, minlen >= 1 guaranties no false positives at s + 1
791 even for \b or \B. But (minlen? 1 : 0) below assumes that
792 regstclass does not come from lookahead... */
793 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
794 This leaves EXACTF only, which is dealt with in find_byclass(). */
890ce7af 795 const U8* const str = (U8*)STRING(prog->regstclass);
3dab1dad 796 const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
1aa99e6b 797 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
66e933ab 798 : 1);
07be1b83 799 const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1aa99e6b 800 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
33b8afdf 801 : (prog->float_substr || prog->float_utf8
1aa99e6b
IH
802 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
803 cl_l, strend)
804 : strend);
07be1b83
YO
805 /*if (OP(prog->regstclass) == TRIE)
806 endpos++;*/
6eb5f6b9 807 t = s;
3b0527fe 808 s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
6eb5f6b9
JH
809 if (!s) {
810#ifdef DEBUGGING
cbbf8932 811 const char *what = NULL;
6eb5f6b9
JH
812#endif
813 if (endpos == strend) {
a3621e74 814 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
815 "Could not match STCLASS...\n") );
816 goto fail;
817 }
a3621e74 818 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 819 "This position contradicts STCLASS...\n") );
653099ff
GS
820 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
821 goto fail;
6eb5f6b9 822 /* Contradict one of substrings */
33b8afdf
JH
823 if (prog->anchored_substr || prog->anchored_utf8) {
824 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 825 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 826 hop_and_restart:
1aa99e6b 827 s = HOP3c(t, 1, strend);
66e933ab
GS
828 if (s + start_shift + end_shift > strend) {
829 /* XXXX Should be taken into account earlier? */
a3621e74 830 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
831 "Could not match STCLASS...\n") );
832 goto fail;
833 }
5e39e1e5
HS
834 if (!check)
835 goto giveup;
a3621e74 836 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 837 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
838 what, (long)(s + start_shift - i_strpos)) );
839 goto restart;
840 }
66e933ab 841 /* Have both, check_string is floating */
6eb5f6b9
JH
842 if (t + start_shift >= check_at) /* Contradicts floating=check */
843 goto retry_floating_check;
844 /* Recheck anchored substring, but not floating... */
9041c2e3 845 s = check_at;
5e39e1e5
HS
846 if (!check)
847 goto giveup;
a3621e74 848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 849 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
850 (long)(other_last - i_strpos)) );
851 goto do_other_anchored;
852 }
60e71179
GS
853 /* Another way we could have checked stclass at the
854 current position only: */
855 if (ml_anch) {
856 s = t = t + 1;
5e39e1e5
HS
857 if (!check)
858 goto giveup;
a3621e74 859 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 860 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 861 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 862 goto try_at_offset;
66e933ab 863 }
33b8afdf 864 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 865 goto fail;
6eb5f6b9
JH
866 /* Check is floating subtring. */
867 retry_floating_check:
868 t = check_at - start_shift;
a3621e74 869 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
870 goto hop_and_restart;
871 }
b7953727 872 if (t != s) {
a3621e74 873 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 874 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
875 (long)(t - i_strpos), (long)(s - i_strpos))
876 );
877 }
878 else {
a3621e74 879 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
880 "Does not contradict STCLASS...\n");
881 );
882 }
6eb5f6b9 883 }
5e39e1e5 884 giveup:
a3621e74 885 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
886 PL_colors[4], (check ? "Guessed" : "Giving up"),
887 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 888 return s;
2c2d71f5
JH
889
890 fail_finish: /* Substring not found */
33b8afdf
JH
891 if (prog->check_substr || prog->check_utf8) /* could be removed already */
892 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 893 fail:
a3621e74 894 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 895 PL_colors[4], PL_colors[5]));
bd61b366 896 return NULL;
cad2e5aa 897}
9661b544 898
6eb5f6b9 899/* We know what class REx starts with. Try to find this position... */
3b0527fe 900/* if reginfo is NULL, its a dryrun */
07be1b83
YO
901/* annoyingly all the vars in this routine have different names from their counterparts
902 in regmatch. /grrr */
3b0527fe 903
4cadc6a9
YO
904#define REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len, uvc, charid, \
905foldlen, foldbuf, uniflags) STMT_START { \
906 switch (trie_type) { \
907 case trie_utf8_fold: \
908 if ( foldlen>0 ) { \
909 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
910 foldlen -= len; \
911 uscan += len; \
912 len=0; \
913 } else { \
914 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
915 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
916 foldlen -= UNISKIP( uvc ); \
917 uscan = foldbuf + UNISKIP( uvc ); \
918 } \
919 break; \
920 case trie_utf8: \
921 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
922 break; \
923 case trie_plain: \
924 uvc = (UV)*uc; \
925 len = 1; \
926 } \
927 \
928 if (uvc < 256) { \
929 charid = trie->charmap[ uvc ]; \
930 } \
931 else { \
932 charid = 0; \
933 if (trie->widecharmap) { \
934 SV** const svpp = hv_fetch(trie->widecharmap, \
935 (char*)&uvc, sizeof(UV), 0); \
936 if (svpp) \
937 charid = (U16)SvIV(*svpp); \
938 } \
939 } \
940} STMT_END
941
942#define REXEC_FBC_EXACTISH_CHECK(CoNd) \
943 if ( (CoNd) \
944 && (ln == len || \
945 ibcmp_utf8(s, NULL, 0, do_utf8, \
946 m, NULL, ln, (bool)UTF)) \
947 && (!reginfo || regtry(reginfo, s)) ) \
948 goto got_it; \
949 else { \
950 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
951 uvchr_to_utf8(tmpbuf, c); \
952 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
953 if ( f != c \
954 && (f == c1 || f == c2) \
955 && (ln == foldlen || \
956 !ibcmp_utf8((char *) foldbuf, \
957 NULL, foldlen, do_utf8, \
958 m, \
959 NULL, ln, (bool)UTF)) \
960 && (!reginfo || regtry(reginfo, s)) ) \
961 goto got_it; \
962 } \
963 s += len
964
965#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
966STMT_START { \
967 while (s <= e) { \
968 if ( (CoNd) \
969 && (ln == 1 || !(OP(c) == EXACTF \
970 ? ibcmp(s, m, ln) \
971 : ibcmp_locale(s, m, ln))) \
972 && (!reginfo || regtry(reginfo, s)) ) \
973 goto got_it; \
974 s++; \
975 } \
976} STMT_END
977
978#define REXEC_FBC_UTF8_SCAN(CoDe) \
979STMT_START { \
980 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
981 CoDe \
982 s += uskip; \
983 } \
984} STMT_END
985
986#define REXEC_FBC_SCAN(CoDe) \
987STMT_START { \
988 while (s < strend) { \
989 CoDe \
990 s++; \
991 } \
992} STMT_END
993
994#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
995REXEC_FBC_UTF8_SCAN( \
996 if (CoNd) { \
997 if (tmp && (!reginfo || regtry(reginfo, s))) \
998 goto got_it; \
999 else \
1000 tmp = doevery; \
1001 } \
1002 else \
1003 tmp = 1; \
1004)
1005
1006#define REXEC_FBC_CLASS_SCAN(CoNd) \
1007REXEC_FBC_SCAN( \
1008 if (CoNd) { \
1009 if (tmp && (!reginfo || regtry(reginfo, s))) \
1010 goto got_it; \
1011 else \
1012 tmp = doevery; \
1013 } \
1014 else \
1015 tmp = 1; \
1016)
1017
1018#define REXEC_FBC_TRYIT \
1019if ((!reginfo || regtry(reginfo, s))) \
1020 goto got_it
1021
1022#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1023 if (do_utf8) { \
1024 UtFpReLoAd; \
1025 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1026 } \
1027 else { \
1028 REXEC_FBC_CLASS_SCAN(CoNd); \
1029 } \
1030 break
1031
1032#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1033 PL_reg_flags |= RF_tainted; \
1034 if (do_utf8) { \
1035 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1036 } \
1037 else { \
1038 REXEC_FBC_CLASS_SCAN(CoNd); \
1039 } \
1040 break
1041
3c3eec57 1042STATIC char *
07be1b83
YO
1043S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1044 const char *strend, const regmatch_info *reginfo)
a687059c 1045{
27da23d5 1046 dVAR;
1df70142 1047 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
6eb5f6b9 1048 char *m;
d8093b23 1049 STRLEN ln;
5dab1207 1050 STRLEN lnc;
078c425b 1051 register STRLEN uskip;
d8093b23 1052 unsigned int c1;
1053 unsigned int c2;
6eb5f6b9
JH
1054 char *e;
1055 register I32 tmp = 1; /* Scratch variable? */
a3b680e6 1056 register const bool do_utf8 = PL_reg_match_utf8;
cad2e5aa 1057
6eb5f6b9
JH
1058 /* We know what class it must start with. */
1059 switch (OP(c)) {
6eb5f6b9 1060 case ANYOF:
388cc4de 1061 if (do_utf8) {
4cadc6a9 1062 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
388cc4de 1063 !UTF8_IS_INVARIANT((U8)s[0]) ?
32fc9b6a 1064 reginclass(prog, c, (U8*)s, 0, do_utf8) :
4cadc6a9 1065 REGINCLASS(prog, c, (U8*)s));
388cc4de
HS
1066 }
1067 else {
1068 while (s < strend) {
1069 STRLEN skip = 1;
1070
32fc9b6a 1071 if (REGINCLASS(prog, c, (U8*)s) ||
388cc4de
HS
1072 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1073 /* The assignment of 2 is intentional:
1074 * for the folded sharp s, the skip is 2. */
1075 (skip = SHARP_S_SKIP))) {
3b0527fe 1076 if (tmp && (!reginfo || regtry(reginfo, s)))
388cc4de
HS
1077 goto got_it;
1078 else
1079 tmp = doevery;
1080 }
1081 else
1082 tmp = 1;
1083 s += skip;
1084 }
a0d0e21e 1085 }
6eb5f6b9 1086 break;
f33976b4 1087 case CANY:
4cadc6a9 1088 REXEC_FBC_SCAN(
3b0527fe 1089 if (tmp && (!reginfo || regtry(reginfo, s)))
f33976b4
DB
1090 goto got_it;
1091 else
1092 tmp = doevery;
4cadc6a9 1093 );
f33976b4 1094 break;
6eb5f6b9 1095 case EXACTF:
5dab1207
NIS
1096 m = STRING(c);
1097 ln = STR_LEN(c); /* length to match in octets/bytes */
1098 lnc = (I32) ln; /* length to match in characters */
1aa99e6b 1099 if (UTF) {
a2a2844f 1100 STRLEN ulen1, ulen2;
5dab1207 1101 U8 *sm = (U8 *) m;
89ebb4a3
JH
1102 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1103 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4ad0818d 1104 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a2a2844f
JH
1105
1106 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1107 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1108
89ebb4a3 1109 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
041457d9 1110 0, uniflags);
89ebb4a3 1111 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
041457d9 1112 0, uniflags);
5dab1207
NIS
1113 lnc = 0;
1114 while (sm < ((U8 *) m + ln)) {
1115 lnc++;
1116 sm += UTF8SKIP(sm);
1117 }
1aa99e6b
IH
1118 }
1119 else {
1120 c1 = *(U8*)m;
1121 c2 = PL_fold[c1];
1122 }
6eb5f6b9
JH
1123 goto do_exactf;
1124 case EXACTFL:
5dab1207
NIS
1125 m = STRING(c);
1126 ln = STR_LEN(c);
1127 lnc = (I32) ln;
d8093b23 1128 c1 = *(U8*)m;
6eb5f6b9
JH
1129 c2 = PL_fold_locale[c1];
1130 do_exactf:
db12adc6 1131 e = HOP3c(strend, -((I32)lnc), s);
b3c9acc1 1132
3b0527fe 1133 if (!reginfo && e < s)
6eb5f6b9 1134 e = s; /* Due to minlen logic of intuit() */
1aa99e6b 1135
60a8b682
JH
1136 /* The idea in the EXACTF* cases is to first find the
1137 * first character of the EXACTF* node and then, if
1138 * necessary, case-insensitively compare the full
1139 * text of the node. The c1 and c2 are the first
1140 * characters (though in Unicode it gets a bit
1141 * more complicated because there are more cases
7f16dd3d
JH
1142 * than just upper and lower: one needs to use
1143 * the so-called folding case for case-insensitive
1144 * matching (called "loose matching" in Unicode).
1145 * ibcmp_utf8() will do just that. */
60a8b682 1146
1aa99e6b 1147 if (do_utf8) {
575cac57 1148 UV c, f;
89ebb4a3 1149 U8 tmpbuf [UTF8_MAXBYTES+1];
575cac57 1150 STRLEN len, foldlen;
4ad0818d 1151 const U32 uniflags = UTF8_ALLOW_DEFAULT;
09091399 1152 if (c1 == c2) {
5dab1207
NIS
1153 /* Upper and lower of 1st char are equal -
1154 * probably not a "letter". */
1aa99e6b 1155 while (s <= e) {
89ebb4a3 1156 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1157 uniflags);
4cadc6a9 1158 REXEC_FBC_EXACTISH_CHECK(c == c1);
1aa99e6b 1159 }
09091399
JH
1160 }
1161 else {
1aa99e6b 1162 while (s <= e) {
89ebb4a3 1163 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
041457d9 1164 uniflags);
80aecb99 1165
60a8b682 1166 /* Handle some of the three Greek sigmas cases.
8c01da3c
JH
1167 * Note that not all the possible combinations
1168 * are handled here: some of them are handled
1169 * by the standard folding rules, and some of
1170 * them (the character class or ANYOF cases)
1171 * are handled during compiletime in
1172 * regexec.c:S_regclass(). */
880bd946
JH
1173 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1174 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1175 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
80aecb99 1176
4cadc6a9 1177 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1aa99e6b 1178 }
09091399 1179 }
1aa99e6b
IH
1180 }
1181 else {
1182 if (c1 == c2)
4cadc6a9 1183 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1aa99e6b 1184 else
4cadc6a9 1185 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
b3c9acc1
IZ
1186 }
1187 break;
bbce6d69 1188 case BOUNDL:
3280af22 1189 PL_reg_flags |= RF_tainted;
bbce6d69 1190 /* FALL THROUGH */
a0d0e21e 1191 case BOUND:
ffc61ed2 1192 if (do_utf8) {
12d33761 1193 if (s == PL_bostr)
ffc61ed2
JH
1194 tmp = '\n';
1195 else {
6136c704 1196 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1197 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1198 }
1199 tmp = ((OP(c) == BOUND ?
9041c2e3 1200 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1201 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1202 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1203 if (tmp == !(OP(c) == BOUND ?
bb7a0f54 1204 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1205 isALNUM_LC_utf8((U8*)s)))
1206 {
1207 tmp = !tmp;
4cadc6a9 1208 REXEC_FBC_TRYIT;
a687059c 1209 }
4cadc6a9 1210 );
a0d0e21e 1211 }
667bb95a 1212 else {
12d33761 1213 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2 1214 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1215 REXEC_FBC_SCAN(
ffc61ed2
JH
1216 if (tmp ==
1217 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1218 tmp = !tmp;
4cadc6a9 1219 REXEC_FBC_TRYIT;
a0ed51b3 1220 }
4cadc6a9 1221 );
a0ed51b3 1222 }
3b0527fe 1223 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1224 goto got_it;
1225 break;
bbce6d69 1226 case NBOUNDL:
3280af22 1227 PL_reg_flags |= RF_tainted;
bbce6d69 1228 /* FALL THROUGH */
a0d0e21e 1229 case NBOUND:
ffc61ed2 1230 if (do_utf8) {
12d33761 1231 if (s == PL_bostr)
ffc61ed2
JH
1232 tmp = '\n';
1233 else {
6136c704 1234 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
4ad0818d 1235 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
1236 }
1237 tmp = ((OP(c) == NBOUND ?
9041c2e3 1238 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1a4fad37 1239 LOAD_UTF8_CHARCLASS_ALNUM();
4cadc6a9 1240 REXEC_FBC_UTF8_SCAN(
ffc61ed2 1241 if (tmp == !(OP(c) == NBOUND ?
bb7a0f54 1242 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
ffc61ed2
JH
1243 isALNUM_LC_utf8((U8*)s)))
1244 tmp = !tmp;
4cadc6a9
YO
1245 else REXEC_FBC_TRYIT;
1246 );
a0d0e21e 1247 }
667bb95a 1248 else {
12d33761 1249 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
ffc61ed2
JH
1250 tmp = ((OP(c) == NBOUND ?
1251 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
4cadc6a9 1252 REXEC_FBC_SCAN(
ffc61ed2
JH
1253 if (tmp ==
1254 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1255 tmp = !tmp;
4cadc6a9
YO
1256 else REXEC_FBC_TRYIT;
1257 );
a0ed51b3 1258 }
3b0527fe 1259 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, s)))
a0ed51b3
LW
1260 goto got_it;
1261 break;
a0d0e21e 1262 case ALNUM:
4cadc6a9
YO
1263 REXEC_FBC_CSCAN_PRELOAD(
1264 LOAD_UTF8_CHARCLASS_ALNUM(),
1265 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1266 isALNUM(*s)
1267 );
bbce6d69 1268 case ALNUML:
4cadc6a9
YO
1269 REXEC_FBC_CSCAN_TAINT(
1270 isALNUM_LC_utf8((U8*)s),
1271 isALNUM_LC(*s)
1272 );
a0d0e21e 1273 case NALNUM:
4cadc6a9
YO
1274 REXEC_FBC_CSCAN_PRELOAD(
1275 LOAD_UTF8_CHARCLASS_ALNUM(),
1276 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1277 !isALNUM(*s)
1278 );
bbce6d69 1279 case NALNUML:
4cadc6a9
YO
1280 REXEC_FBC_CSCAN_TAINT(
1281 !isALNUM_LC_utf8((U8*)s),
1282 !isALNUM_LC(*s)
1283 );
a0d0e21e 1284 case SPACE:
4cadc6a9
YO
1285 REXEC_FBC_CSCAN_PRELOAD(
1286 LOAD_UTF8_CHARCLASS_SPACE(),
1287 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1288 isSPACE(*s)
1289 );
bbce6d69 1290 case SPACEL:
4cadc6a9
YO
1291 REXEC_FBC_CSCAN_TAINT(
1292 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1293 isSPACE_LC(*s)
1294 );
a0d0e21e 1295 case NSPACE:
4cadc6a9
YO
1296 REXEC_FBC_CSCAN_PRELOAD(
1297 LOAD_UTF8_CHARCLASS_SPACE(),
1298 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1299 !isSPACE(*s)
1300 );
bbce6d69 1301 case NSPACEL:
4cadc6a9
YO
1302 REXEC_FBC_CSCAN_TAINT(
1303 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1304 !isSPACE_LC(*s)
1305 );
a0d0e21e 1306 case DIGIT:
4cadc6a9
YO
1307 REXEC_FBC_CSCAN_PRELOAD(
1308 LOAD_UTF8_CHARCLASS_DIGIT(),
1309 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1310 isDIGIT(*s)
1311 );
b8c5462f 1312 case DIGITL:
4cadc6a9
YO
1313 REXEC_FBC_CSCAN_TAINT(
1314 isDIGIT_LC_utf8((U8*)s),
1315 isDIGIT_LC(*s)
1316 );
a0d0e21e 1317 case NDIGIT:
4cadc6a9
YO
1318 REXEC_FBC_CSCAN_PRELOAD(
1319 LOAD_UTF8_CHARCLASS_DIGIT(),
1320 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1321 !isDIGIT(*s)
1322 );
b8c5462f 1323 case NDIGITL:
4cadc6a9
YO
1324 REXEC_FBC_CSCAN_TAINT(
1325 !isDIGIT_LC_utf8((U8*)s),
1326 !isDIGIT_LC(*s)
1327 );
07be1b83
YO
1328 case TRIE:
1329 /*Perl_croak(aTHX_ "panic: unknown regstclass TRIE");*/
1330 {
1331 const enum { trie_plain, trie_utf8, trie_utf8_fold }
1332 trie_type = do_utf8 ?
1333 (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
1334 : trie_plain;
1335 /* what trie are we using right now */
1336 reg_ac_data *aho
1337 = (reg_ac_data*)prog->data->data[ ARG( c ) ];
1338 reg_trie_data *trie=aho->trie;
1339
1340 const char *last_start = strend - trie->minlen;
6148ee25 1341#ifdef DEBUGGING
07be1b83 1342 const char *real_start = s;
6148ee25 1343#endif
07be1b83 1344 STRLEN maxlen = trie->maxlen;
be8e71aa
YO
1345 SV *sv_points;
1346 U8 **points; /* map of where we were in the input string
1347 when reading a given string. For ASCII this
1348 is unnecessary overhead as the relationship
1349 is always 1:1, but for unicode, especially
1350 case folded unicode this is not true. */
f9e705e8 1351 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
07be1b83
YO
1352
1353 GET_RE_DEBUG_FLAGS_DECL;
1354
be8e71aa
YO
1355 /* We can't just allocate points here. We need to wrap it in
1356 * an SV so it gets freed properly if there is a croak while
1357 * running the match */
1358 ENTER;
1359 SAVETMPS;
1360 sv_points=newSV(maxlen * sizeof(U8 *));
1361 SvCUR_set(sv_points,
1362 maxlen * sizeof(U8 *));
1363 SvPOK_on(sv_points);
1364 sv_2mortal(sv_points);
1365 points=(U8**)SvPV_nolen(sv_points );
07be1b83
YO
1366
1367 if (trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1368 while (s <= last_start && !TRIE_BITMAP_TEST(trie,*s) ) {
07be1b83
YO
1369 s++;
1370 }
1371 }
1372
1373 while (s <= last_start) {
1374 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1375 U8 *uc = (U8*)s;
1376 U16 charid = 0;
1377 U32 base = 1;
1378 U32 state = 1;
1379 UV uvc = 0;
1380 STRLEN len = 0;
1381 STRLEN foldlen = 0;
1382 U8 *uscan = (U8*)NULL;
1383 U8 *leftmost = NULL;
1384
1385 U32 pointpos = 0;
1386
1387 while ( state && uc <= (U8*)strend ) {
1388 int failed=0;
1389 if (aho->states[ state ].wordnum) {
1390 U8 *lpos= points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1] ) % maxlen ];
1391 if (!leftmost || lpos < leftmost)
1392 leftmost= lpos;
1393 if (base==0) break;
1394 }
1395 points[pointpos++ % maxlen]= uc;
4cadc6a9
YO
1396 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
1397 uvc, charid, foldlen, foldbuf, uniflags);
07be1b83
YO
1398 DEBUG_TRIE_EXECUTE_r(
1399 PerlIO_printf(Perl_debug_log,
1400 "Pos: %d Charid:%3x CV:%4"UVxf" ",
1401 (int)((const char*)uc - real_start), charid, uvc)
1402 );
1403 uc += len;
1404
1405 do {
6148ee25 1406#ifdef DEBUGGING
07be1b83 1407 U32 word = aho->states[ state ].wordnum;
6148ee25 1408#endif
07be1b83
YO
1409 base = aho->states[ state ].trans.base;
1410
1411 DEBUG_TRIE_EXECUTE_r(
1412 PerlIO_printf( Perl_debug_log,
1413 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf" word=%"UVxf"\n",
1414 failed ? "Fail transition to " : "",
1415 state, base, uvc, word)
1416 );
1417 if ( base ) {
1418 U32 tmp;
1419 if (charid &&
1420 (base + charid > trie->uniquecharcount )
1421 && (base + charid - 1 - trie->uniquecharcount
1422 < trie->lasttrans)
1423 && trie->trans[base + charid - 1 -
1424 trie->uniquecharcount].check == state
1425 && (tmp=trie->trans[base + charid - 1 -
1426 trie->uniquecharcount ].next))
1427 {
1428 state = tmp;
1429 break;
1430 }
1431 else {
1432 failed++;
1433 if ( state == 1 )
1434 break;
1435 else
1436 state = aho->fail[state];
1437 }
1438 }
1439 else {
1440 /* we must be accepting here */
1441 failed++;
1442 break;
1443 }
1444 } while(state);
1445 if (failed) {
1446 if (leftmost)
1447 break;
1448 else if (!charid && trie->bitmap && trie_type != trie_utf8_fold) {
13a24bad 1449 while ( uc <= (U8*)last_start && !TRIE_BITMAP_TEST(trie,*uc) ) {
07be1b83
YO
1450 uc++;
1451 }
1452 }
1453 }
1454 }
1455 if ( aho->states[ state ].wordnum ) {
1456 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1457 if (!leftmost || lpos < leftmost)
1458 leftmost = lpos;
1459 }
1460 DEBUG_TRIE_EXECUTE_r(
1461 PerlIO_printf( Perl_debug_log,
1462 "%sState: %4"UVxf", Base: 0x%-4"UVxf" uvc=%"UVxf"\n",
1463 "All done: ",
1464 state, base, uvc)
1465 );
1466 if (leftmost) {
1467 s = (char*)leftmost;
be8e71aa
YO
1468 if (!reginfo || regtry(reginfo, s)) {
1469 FREETMPS;
1470 LEAVE;
07be1b83 1471 goto got_it;
be8e71aa 1472 }
07be1b83
YO
1473 s = HOPc(s,1);
1474 } else {
1475 break;
1476 }
1477 }
be8e71aa
YO
1478 FREETMPS;
1479 LEAVE;
07be1b83
YO
1480 }
1481 break;
b3c9acc1 1482 default:
3c3eec57
GS
1483 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1484 break;
d6a28714 1485 }
6eb5f6b9
JH
1486 return 0;
1487 got_it:
1488 return s;
1489}
1490
1491/*
1492 - regexec_flags - match a regexp against a string
1493 */
1494I32
1495Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1496 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1497/* strend: pointer to null at end of string */
1498/* strbeg: real beginning of string */
1499/* minend: end of match must be >=minend after stringarg. */
1500/* data: May be used for some additional optimizations. */
1501/* nosave: For optimizations. */
1502{
97aff369 1503 dVAR;
6eb5f6b9
JH
1504 register char *s;
1505 register regnode *c;
1506 register char *startpos = stringarg;
6eb5f6b9
JH
1507 I32 minlen; /* must match at least this many chars */
1508 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
1509 I32 end_shift = 0; /* Same for the end. */ /* CC */
1510 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 1511 char *scream_olds = NULL;
3dab1dad 1512 SV* const oreplsv = GvSV(PL_replgv);
1df70142 1513 const bool do_utf8 = DO_UTF8(sv);
2757e526 1514 I32 multiline;
0df25f3d 1515
3b0527fe 1516 regmatch_info reginfo; /* create some info to pass to regtry etc */
a3621e74
YO
1517
1518 GET_RE_DEBUG_FLAGS_DECL;
1519
9d4ba2ae 1520 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
1521
1522 /* Be paranoid... */
1523 if (prog == NULL || startpos == NULL) {
1524 Perl_croak(aTHX_ "NULL regexp parameter");
1525 return 0;
1526 }
1527
2757e526 1528 multiline = prog->reganch & PMf_MULTILINE;
3b0527fe 1529 reginfo.prog = prog;
2757e526 1530
bac06658
JH
1531 RX_MATCH_UTF8_set(prog, do_utf8);
1532
6eb5f6b9 1533 minlen = prog->minlen;
61a36c01 1534 if (strend - startpos < minlen) {
a3621e74 1535 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
1536 "String too short [regexec_flags]...\n"));
1537 goto phooey;
1aa99e6b 1538 }
6eb5f6b9 1539
6eb5f6b9
JH
1540 /* Check validity of program. */
1541 if (UCHARAT(prog->program) != REG_MAGIC) {
1542 Perl_croak(aTHX_ "corrupted regexp program");
1543 }
1544
1545 PL_reg_flags = 0;
1546 PL_reg_eval_set = 0;
1547 PL_reg_maxiter = 0;
1548
1549 if (prog->reganch & ROPT_UTF8)
1550 PL_reg_flags |= RF_utf8;
1551
1552 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 1553 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 1554 PL_bostr = strbeg;
3b0527fe 1555 reginfo.sv = sv;
6eb5f6b9
JH
1556
1557 /* Mark end of line for $ (and such) */
1558 PL_regeol = strend;
1559
1560 /* see how far we have to get to not match where we matched before */
3b0527fe 1561 reginfo.till = startpos+minend;
6eb5f6b9 1562
6eb5f6b9
JH
1563 /* If there is a "must appear" string, look for it. */
1564 s = startpos;
1565
3b0527fe 1566 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9
JH
1567 MAGIC *mg;
1568
1569 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
3b0527fe 1570 reginfo.ganch = startpos;
6eb5f6b9
JH
1571 else if (sv && SvTYPE(sv) >= SVt_PVMG
1572 && SvMAGIC(sv)
14befaf4
DM
1573 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1574 && mg->mg_len >= 0) {
3b0527fe 1575 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
6eb5f6b9 1576 if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1577 if (s > reginfo.ganch)
6eb5f6b9 1578 goto phooey;
3b0527fe 1579 s = reginfo.ganch;
6eb5f6b9
JH
1580 }
1581 }
1582 else /* pos() not defined */
3b0527fe 1583 reginfo.ganch = strbeg;
6eb5f6b9
JH
1584 }
1585
a0714e2c 1586 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
1587 re_scream_pos_data d;
1588
1589 d.scream_olds = &scream_olds;
1590 d.scream_pos = &scream_pos;
1591 s = re_intuit_start(prog, sv, s, strend, flags, &d);
3fa9c3d7 1592 if (!s) {
a3621e74 1593 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 1594 goto phooey; /* not present */
3fa9c3d7 1595 }
6eb5f6b9
JH
1596 }
1597
ab3bbdeb
YO
1598 DEBUG_EXECUTE_r(
1599 debug_start_match(prog, do_utf8, startpos, strend,
1600 "Matching");
2a782b5b 1601 );
6eb5f6b9
JH
1602
1603 /* Simplest case: anchored match need be tried only once. */
1604 /* [unless only anchor is BOL and multiline is set] */
1605 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
3b0527fe 1606 if (s == startpos && regtry(&reginfo, startpos))
6eb5f6b9 1607 goto got_it;
7fba1cd6 1608 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
6eb5f6b9
JH
1609 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1610 {
1611 char *end;
1612
1613 if (minlen)
1614 dontbother = minlen - 1;
1aa99e6b 1615 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 1616 /* for multiline we only have to try after newlines */
33b8afdf 1617 if (prog->check_substr || prog->check_utf8) {
6eb5f6b9
JH
1618 if (s == startpos)
1619 goto after_try;
1620 while (1) {
3b0527fe 1621 if (regtry(&reginfo, s))
6eb5f6b9
JH
1622 goto got_it;
1623 after_try:
1624 if (s >= end)
1625 goto phooey;
1626 if (prog->reganch & RE_USE_INTUIT) {
1627 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1628 if (!s)
1629 goto phooey;
1630 }
1631 else
1632 s++;
1633 }
1634 } else {
1635 if (s > startpos)
1636 s--;
1637 while (s < end) {
1638 if (*s++ == '\n') { /* don't need PL_utf8skip here */
3b0527fe 1639 if (regtry(&reginfo, s))
6eb5f6b9
JH
1640 goto got_it;
1641 }
1642 }
1643 }
1644 }
1645 goto phooey;
1646 } else if (prog->reganch & ROPT_ANCH_GPOS) {
3b0527fe 1647 if (regtry(&reginfo, reginfo.ganch))
6eb5f6b9
JH
1648 goto got_it;
1649 goto phooey;
1650 }
1651
1652 /* Messy cases: unanchored match. */
33b8afdf 1653 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
6eb5f6b9
JH
1654 /* we have /x+whatever/ */
1655 /* it must be a one character string (XXXX Except UTF?) */
33b8afdf 1656 char ch;
bf93d4cc
GS
1657#ifdef DEBUGGING
1658 int did_match = 0;
1659#endif
33b8afdf
JH
1660 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1661 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
3f7c398e 1662 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
bf93d4cc 1663
1aa99e6b 1664 if (do_utf8) {
4cadc6a9 1665 REXEC_FBC_SCAN(
6eb5f6b9 1666 if (*s == ch) {
a3621e74 1667 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1668 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1669 s += UTF8SKIP(s);
1670 while (s < strend && *s == ch)
1671 s += UTF8SKIP(s);
1672 }
4cadc6a9 1673 );
6eb5f6b9
JH
1674 }
1675 else {
4cadc6a9 1676 REXEC_FBC_SCAN(
6eb5f6b9 1677 if (*s == ch) {
a3621e74 1678 DEBUG_EXECUTE_r( did_match = 1 );
3b0527fe 1679 if (regtry(&reginfo, s)) goto got_it;
6eb5f6b9
JH
1680 s++;
1681 while (s < strend && *s == ch)
1682 s++;
1683 }
4cadc6a9 1684 );
6eb5f6b9 1685 }
a3621e74 1686 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 1687 PerlIO_printf(Perl_debug_log,
b7953727
JH
1688 "Did not find anchored character...\n")
1689 );
6eb5f6b9 1690 }
a0714e2c
SS
1691 else if (prog->anchored_substr != NULL
1692 || prog->anchored_utf8 != NULL
1693 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
1694 && prog->float_max_offset < strend - s)) {
1695 SV *must;
1696 I32 back_max;
1697 I32 back_min;
1698 char *last;
6eb5f6b9 1699 char *last1; /* Last position checked before */
bf93d4cc
GS
1700#ifdef DEBUGGING
1701 int did_match = 0;
1702#endif
33b8afdf
JH
1703 if (prog->anchored_substr || prog->anchored_utf8) {
1704 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1705 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1706 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1707 back_max = back_min = prog->anchored_offset;
1708 } else {
1709 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1710 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1711 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1712 back_max = prog->float_max_offset;
1713 back_min = prog->float_min_offset;
1714 }
1715 if (must == &PL_sv_undef)
1716 /* could not downgrade utf8 check substring, so must fail */
1717 goto phooey;
1718
1719 last = HOP3c(strend, /* Cannot start after this */
1720 -(I32)(CHR_SVLEN(must)
1721 - (SvTAIL(must) != 0) + back_min), strbeg);
6eb5f6b9
JH
1722
1723 if (s > PL_bostr)
1724 last1 = HOPc(s, -1);
1725 else
1726 last1 = s - 1; /* bogus */
1727
a0288114 1728 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
1729 check_substr==must. */
1730 scream_pos = -1;
1731 dontbother = end_shift;
1732 strend = HOPc(strend, -dontbother);
1733 while ( (s <= last) &&
9041c2e3 1734 ((flags & REXEC_SCREAM)
1aa99e6b 1735 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
6eb5f6b9 1736 end_shift, &scream_pos, 0))
1aa99e6b 1737 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
9041c2e3 1738 (unsigned char*)strend, must,
7fba1cd6 1739 multiline ? FBMrf_MULTILINE : 0))) ) {
4addbd3b
HS
1740 /* we may be pointing at the wrong string */
1741 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
3f7c398e 1742 s = strbeg + (s - SvPVX_const(sv));
a3621e74 1743 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
1744 if (HOPc(s, -back_max) > last1) {
1745 last1 = HOPc(s, -back_min);
1746 s = HOPc(s, -back_max);
1747 }
1748 else {
52657f30 1749 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
1750
1751 last1 = HOPc(s, -back_min);
52657f30 1752 s = t;
6eb5f6b9 1753 }
1aa99e6b 1754 if (do_utf8) {
6eb5f6b9 1755 while (s <= last1) {
3b0527fe 1756 if (regtry(&reginfo, s))
6eb5f6b9
JH
1757 goto got_it;
1758 s += UTF8SKIP(s);
1759 }
1760 }
1761 else {
1762 while (s <= last1) {
3b0527fe 1763 if (regtry(&reginfo, s))
6eb5f6b9
JH
1764 goto got_it;
1765 s++;
1766 }
1767 }
1768 }
ab3bbdeb
YO
1769 DEBUG_EXECUTE_r(if (!did_match) {
1770 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
1771 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1772 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 1773 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 1774 ? "anchored" : "floating"),
ab3bbdeb
YO
1775 quoted, RE_SV_TAIL(must));
1776 });
6eb5f6b9
JH
1777 goto phooey;
1778 }
155aba94 1779 else if ((c = prog->regstclass)) {
f14c76ed 1780 if (minlen) {
be8e71aa 1781 const OPCODE op = OP(prog->regstclass);
66e933ab 1782 /* don't bother with what can't match */
07be1b83 1783 if (PL_regkind[op] != EXACT && op != CANY && op != TRIE)
f14c76ed
RGS
1784 strend = HOPc(strend, -(minlen - 1));
1785 }
a3621e74 1786 DEBUG_EXECUTE_r({
be8e71aa 1787 SV * const prop = sv_newmortal();
32fc9b6a 1788 regprop(prog, prop, c);
0df25f3d 1789 {
ab3bbdeb
YO
1790 RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
1791 s,strend-s,60);
0df25f3d 1792 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
1793 "Matching stclass %.*s against %s (%d chars)\n",
1794 SvCUR(prop), SvPVX_const(prop),
1795 quoted, (int)(strend - s));
0df25f3d 1796 }
ffc61ed2 1797 });
3b0527fe 1798 if (find_byclass(prog, c, s, strend, &reginfo))
6eb5f6b9 1799 goto got_it;
07be1b83 1800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
1801 }
1802 else {
1803 dontbother = 0;
a0714e2c 1804 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 1805 /* Trim the end. */
d6a28714 1806 char *last;
33b8afdf
JH
1807 SV* float_real;
1808
1809 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1810 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1811 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
d6a28714
JH
1812
1813 if (flags & REXEC_SCREAM) {
33b8afdf 1814 last = screaminstr(sv, float_real, s - strbeg,
d6a28714
JH
1815 end_shift, &scream_pos, 1); /* last one */
1816 if (!last)
ffc61ed2 1817 last = scream_olds; /* Only one occurrence. */
4addbd3b
HS
1818 /* we may be pointing at the wrong string */
1819 else if (RX_MATCH_COPIED(prog))
3f7c398e 1820 s = strbeg + (s - SvPVX_const(sv));
b8c5462f 1821 }
d6a28714
JH
1822 else {
1823 STRLEN len;
cfd0369c 1824 const char * const little = SvPV_const(float_real, len);
d6a28714 1825
33b8afdf 1826 if (SvTAIL(float_real)) {
d6a28714
JH
1827 if (memEQ(strend - len + 1, little, len - 1))
1828 last = strend - len + 1;
7fba1cd6 1829 else if (!multiline)
9041c2e3 1830 last = memEQ(strend - len, little, len)
bd61b366 1831 ? strend - len : NULL;
b8c5462f 1832 else
d6a28714
JH
1833 goto find_last;
1834 } else {
1835 find_last:
9041c2e3 1836 if (len)
d6a28714 1837 last = rninstr(s, strend, little, little + len);
b8c5462f 1838 else
a0288114 1839 last = strend; /* matching "$" */
b8c5462f 1840 }
b8c5462f 1841 }
bf93d4cc 1842 if (last == NULL) {
a3621e74 1843 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
bf93d4cc 1844 "%sCan't trim the tail, match fails (should not happen)%s\n",
e4584336 1845 PL_colors[4], PL_colors[5]));
bf93d4cc
GS
1846 goto phooey; /* Should not happen! */
1847 }
d6a28714
JH
1848 dontbother = strend - last + prog->float_min_offset;
1849 }
1850 if (minlen && (dontbother < minlen))
1851 dontbother = minlen - 1;
1852 strend -= dontbother; /* this one's always in bytes! */
1853 /* We don't know much -- general case. */
1aa99e6b 1854 if (do_utf8) {
d6a28714 1855 for (;;) {
3b0527fe 1856 if (regtry(&reginfo, s))
d6a28714
JH
1857 goto got_it;
1858 if (s >= strend)
1859 break;
b8c5462f 1860 s += UTF8SKIP(s);
d6a28714
JH
1861 };
1862 }
1863 else {
1864 do {
3b0527fe 1865 if (regtry(&reginfo, s))
d6a28714
JH
1866 goto got_it;
1867 } while (s++ < strend);
1868 }
1869 }
1870
1871 /* Failure. */
1872 goto phooey;
1873
1874got_it:
1875 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1876
1877 if (PL_reg_eval_set) {
1878 /* Preserve the current value of $^R */
1879 if (oreplsv != GvSV(PL_replgv))
1880 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1881 restored, the value remains
1882 the same. */
4f639d21 1883 restore_pos(aTHX_ prog);
d6a28714
JH
1884 }
1885
1886 /* make sure $`, $&, $', and $digit will work later */
1887 if ( !(flags & REXEC_NOT_FIRST) ) {
ed252734 1888 RX_MATCH_COPY_FREE(prog);
d6a28714 1889 if (flags & REXEC_COPY_STR) {
be8e71aa 1890 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
f8c7b90f 1891#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
1892 if ((SvIsCOW(sv)
1893 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
1894 if (DEBUG_C_TEST) {
1895 PerlIO_printf(Perl_debug_log,
1896 "Copy on write: regexp capture, type %d\n",
1897 (int) SvTYPE(sv));
1898 }
1899 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 1900 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734
NC
1901 assert (SvPOKp(prog->saved_copy));
1902 } else
1903#endif
1904 {
1905 RX_MATCH_COPIED_on(prog);
1906 s = savepvn(strbeg, i);
1907 prog->subbeg = s;
1908 }
d6a28714 1909 prog->sublen = i;
d6a28714
JH
1910 }
1911 else {
1912 prog->subbeg = strbeg;
1913 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1914 }
1915 }
9041c2e3 1916
d6a28714
JH
1917 return 1;
1918
1919phooey:
a3621e74 1920 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 1921 PL_colors[4], PL_colors[5]));
d6a28714 1922 if (PL_reg_eval_set)
4f639d21 1923 restore_pos(aTHX_ prog);
d6a28714
JH
1924 return 0;
1925}
1926
1927/*
1928 - regtry - try match at specific point
1929 */
1930STATIC I32 /* 0 failure, 1 success */
3b0527fe 1931S_regtry(pTHX_ const regmatch_info *reginfo, char *startpos)
d6a28714 1932{
97aff369 1933 dVAR;
d6a28714
JH
1934 register I32 *sp;
1935 register I32 *ep;
1936 CHECKPOINT lastcp;
3b0527fe 1937 regexp *prog = reginfo->prog;
a3621e74 1938 GET_RE_DEBUG_FLAGS_DECL;
d6a28714 1939
02db2b7b
IZ
1940#ifdef DEBUGGING
1941 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1942#endif
d6a28714
JH
1943 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1944 MAGIC *mg;
1945
1946 PL_reg_eval_set = RS_init;
a3621e74 1947 DEBUG_EXECUTE_r(DEBUG_s(
b900a521
JH
1948 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1949 (IV)(PL_stack_sp - PL_stack_base));
d6a28714 1950 ));
e8347627 1951 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
d6a28714
JH
1952 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1953 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1954 SAVETMPS;
1955 /* Apparently this is not needed, judging by wantarray. */
e8347627 1956 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
d6a28714
JH
1957 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1958
3b0527fe 1959 if (reginfo->sv) {
d6a28714 1960 /* Make $_ available to executed code. */
3b0527fe 1961 if (reginfo->sv != DEFSV) {
59f00321 1962 SAVE_DEFSV;
3b0527fe 1963 DEFSV = reginfo->sv;
b8c5462f 1964 }
d6a28714 1965
3b0527fe
DM
1966 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
1967 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 1968 /* prepare for quick setting of pos */
d300d9fa
NC
1969#ifdef PERL_OLD_COPY_ON_WRITE
1970 if (SvIsCOW(sv))
1971 sv_force_normal_flags(sv, 0);
1972#endif
3dab1dad 1973 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 1974 &PL_vtbl_mglob, NULL, 0);
d6a28714 1975 mg->mg_len = -1;
b8c5462f 1976 }
d6a28714
JH
1977 PL_reg_magic = mg;
1978 PL_reg_oldpos = mg->mg_len;
4f639d21 1979 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 1980 }
09687e5a 1981 if (!PL_reg_curpm) {
a02a5408 1982 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
1983#ifdef USE_ITHREADS
1984 {
be8e71aa 1985 SV* const repointer = newSViv(0);
577e12cc 1986 /* so we know which PL_regex_padav element is PL_reg_curpm */
35061a7e 1987 SvFLAGS(repointer) |= SVf_BREAK;
09687e5a
AB
1988 av_push(PL_regex_padav,repointer);
1989 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
1990 PL_regex_pad = AvARRAY(PL_regex_padav);
1991 }
1992#endif
1993 }
aaa362c4 1994 PM_SETRE(PL_reg_curpm, prog);
d6a28714
JH
1995 PL_reg_oldcurpm = PL_curpm;
1996 PL_curpm = PL_reg_curpm;
1997 if (RX_MATCH_COPIED(prog)) {
1998 /* Here is a serious problem: we cannot rewrite subbeg,
1999 since it may be needed if this match fails. Thus
2000 $` inside (?{}) could fail... */
2001 PL_reg_oldsaved = prog->subbeg;
2002 PL_reg_oldsavedlen = prog->sublen;
f8c7b90f 2003#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2004 PL_nrs = prog->saved_copy;
2005#endif
d6a28714
JH
2006 RX_MATCH_COPIED_off(prog);
2007 }
2008 else
bd61b366 2009 PL_reg_oldsaved = NULL;
d6a28714
JH
2010 prog->subbeg = PL_bostr;
2011 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2012 }
973dddac 2013 prog->startp[0] = startpos - PL_bostr;
d6a28714
JH
2014 PL_reginput = startpos;
2015 PL_regstartp = prog->startp;
2016 PL_regendp = prog->endp;
2017 PL_reglastparen = &prog->lastparen;
a01268b5 2018 PL_reglastcloseparen = &prog->lastcloseparen;
d6a28714 2019 prog->lastparen = 0;
03994de8 2020 prog->lastcloseparen = 0;
d6a28714 2021 PL_regsize = 0;
a3621e74 2022 DEBUG_EXECUTE_r(PL_reg_starttry = startpos);
d6a28714
JH
2023 if (PL_reg_start_tmpl <= prog->nparens) {
2024 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2025 if(PL_reg_start_tmp)
2026 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2027 else
a02a5408 2028 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
d6a28714
JH
2029 }
2030
2031 /* XXXX What this code is doing here?!!! There should be no need
2032 to do this again and again, PL_reglastparen should take care of
3dd2943c 2033 this! --ilya*/
dafc8851
JH
2034
2035 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2036 * Actually, the code in regcppop() (which Ilya may be meaning by
daf18116
JH
2037 * PL_reglastparen), is not needed at all by the test suite
2038 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2039 * enough, for building DynaLoader, or otherwise this
2040 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2041 * will happen. Meanwhile, this code *is* needed for the
2042 * above-mentioned test suite tests to succeed. The common theme
2043 * on those tests seems to be returning null fields from matches.
2044 * --jhi */
dafc8851 2045#if 1
d6a28714
JH
2046 sp = prog->startp;
2047 ep = prog->endp;
2048 if (prog->nparens) {
097eb12c 2049 register I32 i;
eb160463 2050 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
d6a28714
JH
2051 *++sp = -1;
2052 *++ep = -1;
2053 }
2054 }
dafc8851 2055#endif
02db2b7b 2056 REGCP_SET(lastcp);
3b0527fe 2057 if (regmatch(reginfo, prog->program + 1)) {
d6a28714
JH
2058 prog->endp[0] = PL_reginput - PL_bostr;
2059 return 1;
2060 }
02db2b7b 2061 REGCP_UNWIND(lastcp);
d6a28714
JH
2062 return 0;
2063}
2064
02db2b7b 2065
8ba1375e
MJD
2066#define sayYES goto yes
2067#define sayNO goto no
e0f9d4a8 2068#define sayNO_ANYOF goto no_anyof
8ba1375e 2069#define sayYES_FINAL goto yes_final
8ba1375e
MJD
2070#define sayNO_FINAL goto no_final
2071#define sayNO_SILENT goto do_no
2072#define saySAME(x) if (x) goto yes; else goto no
2073
3ab3c9b4 2074#define CACHEsayNO STMT_START { \
3298f257
DM
2075 if (st->u.whilem.cache_offset | st->u.whilem.cache_bit) \
2076 PL_reg_poscache[st->u.whilem.cache_offset] |= \
2077 (1<<st->u.whilem.cache_bit); \
3ab3c9b4
HS
2078 sayNO; \
2079} STMT_END
2080
3298f257 2081
a3621e74
YO
2082/* this is used to determine how far from the left messages like
2083 'failed...' are printed. Currently 29 makes these messages line
2084 up with the opcode they refer to. Earlier perls used 25 which
2085 left these messages outdented making reviewing a debug output
2086 quite difficult.
2087*/
2088#define REPORT_CODE_OFF 29
2089
2090
2091/* Make sure there is a test for this +1 options in re_tests */
2092#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2093
40a82448
DM
2094#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2095#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
9e137952 2096
86545054
DM
2097#define SLAB_FIRST(s) (&(s)->states[0])
2098#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2099
5d9a96ca
DM
2100/* grab a new slab and return the first slot in it */
2101
2102STATIC regmatch_state *
2103S_push_slab(pTHX)
2104{
54df2634
NC
2105#if PERL_VERSION < 9
2106 dMY_CXT;
2107#endif
5d9a96ca
DM
2108 regmatch_slab *s = PL_regmatch_slab->next;
2109 if (!s) {
2110 Newx(s, 1, regmatch_slab);
2111 s->prev = PL_regmatch_slab;
2112 s->next = NULL;
2113 PL_regmatch_slab->next = s;
2114 }
2115 PL_regmatch_slab = s;
86545054 2116 return SLAB_FIRST(s);
5d9a96ca 2117}
5b47454d 2118
95b24440
DM
2119/* simulate a recursive call to regmatch */
2120
2121#define REGMATCH(ns, where) \
5d9a96ca
DM
2122 st->scan = scan; \
2123 scan = (ns); \
2124 st->resume_state = resume_##where; \
95b24440
DM
2125 goto start_recurse; \
2126 resume_point_##where:
2127
40a82448
DM
2128/* push a new state then goto it */
2129
2130#define PUSH_STATE_GOTO(state, node) \
2131 scan = node; \
2132 st->resume_state = state; \
2133 goto push_state;
2134
2135/* push a new state with success backtracking, then goto it */
2136
2137#define PUSH_YES_STATE_GOTO(state, node) \
2138 scan = node; \
2139 st->resume_state = state; \
2140 goto push_yes_state;
2141
aa283a38 2142
aa283a38 2143
d6a28714
JH
2144/*
2145 - regmatch - main matching routine
2146 *
2147 * Conceptually the strategy is simple: check to see whether the current
2148 * node matches, call self recursively to see whether the rest matches,
2149 * and then act accordingly. In practice we make some effort to avoid
2150 * recursion, in particular by going through "ordinary" nodes (that don't
2151 * need to know whether the rest of the match failed) by a loop instead of
2152 * by recursion.
2153 */
2154/* [lwall] I've hoisted the register declarations to the outer block in order to
2155 * maybe save a little bit of pushing and popping on the stack. It also takes
2156 * advantage of machines that use a register save mask on subroutine entry.
95b24440
DM
2157 *
2158 * This function used to be heavily recursive, but since this had the
2159 * effect of blowing the CPU stack on complex regexes, it has been
2160 * restructured to be iterative, and to save state onto the heap rather
2161 * than the stack. Essentially whereever regmatch() used to be called, it
2162 * pushes the current state, notes where to return, then jumps back into
2163 * the main loop.
2164 *
2165 * Originally the structure of this function used to look something like
2166
2167 S_regmatch() {
2168 int a = 1, b = 2;
2169 ...
2170 while (scan != NULL) {
5d9a96ca 2171 a++; // do stuff with a and b
95b24440
DM
2172 ...
2173 switch (OP(scan)) {
2174 case FOO: {
2175 int local = 3;
2176 ...
2177 if (regmatch(...)) // recurse
2178 goto yes;
2179 }
2180 ...
2181 }
2182 }
2183 yes:
2184 return 1;
2185 }
2186
2187 * Now it looks something like this:
2188
5d9a96ca 2189 typedef struct {
95b24440
DM
2190 int a, b, local;
2191 int resume_state;
5d9a96ca 2192 } regmatch_state;
95b24440
DM
2193
2194 S_regmatch() {
5d9a96ca
DM
2195 regmatch_state *st = new();
2196 int depth=0;
2197 st->a++; // do stuff with a and b
95b24440
DM
2198 ...
2199 while (scan != NULL) {
2200 ...
2201 switch (OP(scan)) {
2202 case FOO: {
5d9a96ca 2203 st->local = 3;
95b24440 2204 ...
5d9a96ca
DM
2205 st->scan = scan;
2206 scan = ...;
2207 st->resume_state = resume_FOO;
2208 goto start_recurse; // recurse
95b24440 2209
5d9a96ca
DM
2210 resume_point_FOO:
2211 if (result)
95b24440
DM
2212 goto yes;
2213 }
2214 ...
2215 }
5d9a96ca
DM
2216 start_recurse:
2217 st = new(); push a new state
2218 st->a = 1; st->b = 2;
2219 depth++;
95b24440 2220 }
5d9a96ca 2221 yes:
95b24440 2222 result = 1;
5d9a96ca
DM
2223 if (depth--) {
2224 st = pop();
95b24440
DM
2225 switch (resume_state) {
2226 case resume_FOO:
2227 goto resume_point_FOO;
2228 ...
2229 }
2230 }
2231 return result
2232 }
2233
2234 * WARNING: this means that any line in this function that contains a
2235 * REGMATCH() or TRYPAREN() is actually simulating a recursive call to
2236 * regmatch() using gotos instead. Thus the values of any local variables
2237 * not saved in the regmatch_state structure will have been lost when
2238 * execution resumes on the next line .
5d9a96ca
DM
2239 *
2240 * States (ie the st pointer) are allocated in slabs of about 4K in size.
2241 * PL_regmatch_state always points to the currently active state, and
2242 * PL_regmatch_slab points to the slab currently containing PL_regmatch_state.
2243 * The first time regmatch is called, the first slab is allocated, and is
2244 * never freed until interpreter desctruction. When the slab is full,
2245 * a new one is allocated chained to the end. At exit from regmatch, slabs
2246 * allocated since entry are freed.
d6a28714 2247 */
95b24440 2248
40a82448 2249/* *** every FOO_fail should = FOO+1 */
c255a977
DM
2250#define TRIE_next (REGNODE_MAX+1)
2251#define TRIE_next_fail (REGNODE_MAX+2)
2252#define EVAL_A (REGNODE_MAX+3)
2253#define EVAL_A_fail (REGNODE_MAX+4)
2254#define resume_CURLYX (REGNODE_MAX+5)
2255#define resume_WHILEM1 (REGNODE_MAX+6)
2256#define resume_WHILEM2 (REGNODE_MAX+7)
2257#define resume_WHILEM3 (REGNODE_MAX+8)
2258#define resume_WHILEM4 (REGNODE_MAX+9)
2259#define resume_WHILEM5 (REGNODE_MAX+10)
2260#define resume_WHILEM6 (REGNODE_MAX+11)
2261#define BRANCH_next (REGNODE_MAX+12)
2262#define BRANCH_next_fail (REGNODE_MAX+13)
2263#define CURLYM_A (REGNODE_MAX+14)
2264#define CURLYM_A_fail (REGNODE_MAX+15)
2265#define CURLYM_B (REGNODE_MAX+16)
2266#define CURLYM_B_fail (REGNODE_MAX+17)
2267#define IFMATCH_A (REGNODE_MAX+18)
2268#define IFMATCH_A_fail (REGNODE_MAX+19)
2269#define CURLY_B_min_known (REGNODE_MAX+20)
2270#define CURLY_B_min_known_fail (REGNODE_MAX+21)
2271#define CURLY_B_min (REGNODE_MAX+22)
2272#define CURLY_B_min_fail (REGNODE_MAX+23)
2273#define CURLY_B_max (REGNODE_MAX+24)
2274#define CURLY_B_max_fail (REGNODE_MAX+25)
40a82448
DM
2275
2276
3dab1dad 2277#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 2278
3df15adc 2279#ifdef DEBUGGING
ab3bbdeb
YO
2280STATIC void
2281S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
2282 const char *start, const char *end, const char *blurb)
2283{
2284 const bool utf8_pat= prog->reganch & ROPT_UTF8 ? 1 : 0;
2285 if (!PL_colorset)
2286 reginitcolors();
2287 {
2288 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2289 prog->precomp, prog->prelen, 60);
2290
2291 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2292 start, end - start, 60);
2293
2294 PerlIO_printf(Perl_debug_log,
2295 "%s%s REx%s %s against %s\n",
2296 PL_colors[4], blurb, PL_colors[5], s0, s1);
2297
2298 if (do_utf8||utf8_pat)
2299 PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
2300 !do_utf8 ? "pattern" : !utf8_pat ? "string" :
2301 "pattern and string"
2302 );
2303 }
2304}
3df15adc
YO
2305
2306STATIC void
07be1b83
YO
2307S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const bool do_utf8)
2308{
2309 const int docolor = *PL_colors[0];
2310 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2311 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2312 /* The part of the string before starttry has one color
2313 (pref0_len chars), between starttry and current
2314 position another one (pref_len - pref0_len chars),
2315 after the current position the third one.
2316 We assume that pref0_len <= pref_len, otherwise we
2317 decrease pref0_len. */
2318 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2319 ? (5 + taill) - l : locinput - PL_bostr;
2320 int pref0_len;
2321
2322 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2323 pref_len++;
2324 pref0_len = pref_len - (locinput - PL_reg_starttry);
2325 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2326 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2327 ? (5 + taill) - pref_len : PL_regeol - locinput);
2328 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2329 l--;
2330 if (pref0_len < 0)
2331 pref0_len = 0;
2332 if (pref0_len > pref_len)
2333 pref0_len = pref_len;
2334 {
3df15adc 2335 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
0df25f3d 2336
ab3bbdeb
YO
2337 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2338 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 2339
ab3bbdeb 2340 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 2341 (locinput - pref_len + pref0_len),
ab3bbdeb 2342 pref_len - pref0_len, 60, 2, 3);
0df25f3d 2343
ab3bbdeb
YO
2344 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2345 locinput, PL_regeol - locinput, 60, 0, 1);
0df25f3d 2346
3df15adc 2347 PerlIO_printf(Perl_debug_log,
ab3bbdeb 2348 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
07be1b83 2349 (IV)(locinput - PL_bostr),
07be1b83 2350 len0, s0,
07be1b83 2351 len1, s1,
07be1b83 2352 (docolor ? "" : "> <"),
07be1b83 2353 len2, s2,
07be1b83
YO
2354 15 - l - pref_len + 1,
2355 "");
2356 }
2357}
3df15adc 2358
07be1b83
YO
2359#endif
2360
d6a28714 2361STATIC I32 /* 0 failure, 1 success */
3b0527fe 2362S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
d6a28714 2363{
54df2634
NC
2364#if PERL_VERSION < 9
2365 dMY_CXT;
2366#endif
27da23d5 2367 dVAR;
95b24440 2368 register const bool do_utf8 = PL_reg_match_utf8;
4ad0818d 2369 const U32 uniflags = UTF8_ALLOW_DEFAULT;
95b24440 2370
3b0527fe
DM
2371 regexp *rex = reginfo->prog;
2372
5d9a96ca
DM
2373 regmatch_slab *orig_slab;
2374 regmatch_state *orig_state;
a3621e74 2375
5d9a96ca
DM
2376 /* the current state. This is a cached copy of PL_regmatch_state */
2377 register regmatch_state *st;
95b24440 2378
5d9a96ca
DM
2379 /* cache heavy used fields of st in registers */
2380 register regnode *scan;
2381 register regnode *next;
2382 register I32 n = 0; /* initialize to shut up compiler warning */
2383 register char *locinput = PL_reginput;
95b24440 2384
5d9a96ca
DM
2385 /* these variables are NOT saved during a recusive RFEGMATCH: */
2386 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2387 bool result; /* return value of S_regmatch */
5d9a96ca 2388 int depth = 0; /* depth of recursion */
77cb431f
DM
2389 regmatch_state *yes_state = NULL; /* state to pop to on success of
2390 subpattern */
40a82448 2391 U32 state_num;
95b24440
DM
2392
2393#ifdef DEBUGGING
e68ec53f 2394 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
2395 PL_regindent++;
2396#endif
2397
5d9a96ca
DM
2398 /* on first ever call to regmatch, allocate first slab */
2399 if (!PL_regmatch_slab) {
2400 Newx(PL_regmatch_slab, 1, regmatch_slab);
2401 PL_regmatch_slab->prev = NULL;
2402 PL_regmatch_slab->next = NULL;
86545054 2403 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
2404 }
2405
2406 /* remember current high-water mark for exit */
2407 /* XXX this should be done with SAVE* instead */
2408 orig_slab = PL_regmatch_slab;
2409 orig_state = PL_regmatch_state;
2410
2411 /* grab next free state slot */
2412 st = ++PL_regmatch_state;
86545054 2413 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
2414 st = PL_regmatch_state = S_push_slab(aTHX);
2415
2416 st->minmod = 0;
2417 st->sw = 0;
2418 st->logical = 0;
5d9a96ca 2419 st->cc = NULL;
d6a28714
JH
2420 /* Note that nextchr is a byte even in UTF */
2421 nextchr = UCHARAT(locinput);
2422 scan = prog;
2423 while (scan != NULL) {
8ba1375e 2424
a3621e74 2425 DEBUG_EXECUTE_r( {
6136c704 2426 SV * const prop = sv_newmortal();
07be1b83 2427 dump_exec_pos( locinput, scan, do_utf8 );
32fc9b6a 2428 regprop(rex, prop, scan);
07be1b83
YO
2429
2430 PerlIO_printf(Perl_debug_log,
2431 "%3"IVdf":%*s%s(%"IVdf")\n",
2432 (IV)(scan - rex->program), PL_regindent*2, "",
2433 SvPVX_const(prop),
2434 PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
2a782b5b 2435 });
d6a28714
JH
2436
2437 next = scan + NEXT_OFF(scan);
2438 if (next == scan)
2439 next = NULL;
40a82448 2440 state_num = OP(scan);
d6a28714 2441
40a82448
DM
2442 reenter_switch:
2443 switch (state_num) {
d6a28714 2444 case BOL:
7fba1cd6 2445 if (locinput == PL_bostr)
d6a28714 2446 {
3b0527fe 2447 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
2448 break;
2449 }
d6a28714
JH
2450 sayNO;
2451 case MBOL:
12d33761
HS
2452 if (locinput == PL_bostr ||
2453 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
d6a28714 2454 {
b8c5462f
JH
2455 break;
2456 }
d6a28714
JH
2457 sayNO;
2458 case SBOL:
c2a73568 2459 if (locinput == PL_bostr)
b8c5462f 2460 break;
d6a28714
JH
2461 sayNO;
2462 case GPOS:
3b0527fe 2463 if (locinput == reginfo->ganch)
d6a28714
JH
2464 break;
2465 sayNO;
2466 case EOL:
d6a28714
JH
2467 goto seol;
2468 case MEOL:
d6a28714 2469 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2470 sayNO;
b8c5462f 2471 break;
d6a28714
JH
2472 case SEOL:
2473 seol:
2474 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
b8c5462f 2475 sayNO;
d6a28714 2476 if (PL_regeol - locinput > 1)
b8c5462f 2477 sayNO;
b8c5462f 2478 break;
d6a28714
JH
2479 case EOS:
2480 if (PL_regeol != locinput)
b8c5462f 2481 sayNO;
d6a28714 2482 break;
ffc61ed2 2483 case SANY:
d6a28714 2484 if (!nextchr && locinput >= PL_regeol)
4633a7c4 2485 sayNO;
f33976b4
DB
2486 if (do_utf8) {
2487 locinput += PL_utf8skip[nextchr];
2488 if (locinput > PL_regeol)
2489 sayNO;
2490 nextchr = UCHARAT(locinput);
2491 }
2492 else
2493 nextchr = UCHARAT(++locinput);
2494 break;
2495 case CANY:
2496 if (!nextchr && locinput >= PL_regeol)
2497 sayNO;
b8c5462f 2498 nextchr = UCHARAT(++locinput);
a0d0e21e 2499 break;
ffc61ed2 2500 case REG_ANY:
1aa99e6b
IH
2501 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2502 sayNO;
2503 if (do_utf8) {
b8c5462f 2504 locinput += PL_utf8skip[nextchr];
d6a28714
JH
2505 if (locinput > PL_regeol)
2506 sayNO;
a0ed51b3 2507 nextchr = UCHARAT(locinput);
a0ed51b3 2508 }
1aa99e6b
IH
2509 else
2510 nextchr = UCHARAT(++locinput);
a0ed51b3 2511 break;
166ba7cd
DM
2512
2513#undef ST
2514#define ST st->u.trie
2515
5b47454d 2516 case TRIE:
3dab1dad 2517 {
07be1b83 2518 /* what type of TRIE am I? (utf8 makes this contextual) */
3dab1dad
YO
2519 const enum { trie_plain, trie_utf8, trie_utf8_fold }
2520 trie_type = do_utf8 ?
2521 (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
2522 : trie_plain;
2523
2524 /* what trie are we using right now */
be8e71aa 2525 reg_trie_data * const trie
3dab1dad
YO
2526 = (reg_trie_data*)rex->data->data[ ARG( scan ) ];
2527 U32 state = trie->startstate;
166ba7cd
DM
2528
2529 U8 *uc = ( U8* )locinput;
2530 U16 charid = 0;
2531 U32 base = 0;
2532 UV uvc = 0;
2533 STRLEN len = 0;
2534 STRLEN foldlen = 0;
2535 U8 *uscan = (U8*)NULL;
2536 STRLEN bufflen=0;
2537 SV *sv_accept_buff = NULL;
2538 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2539
2540 ST.accepted = 0; /* how many accepting states we have seen */
2541 ST.B = next;
2542#ifdef DEBUGGING
2543 ST.me = scan;
2544#endif
3dab1dad
YO
2545
2546 if (trie->bitmap && trie_type != trie_utf8_fold &&
2547 !TRIE_BITMAP_TEST(trie,*locinput)
2548 ) {
2549 if (trie->states[ state ].wordnum) {
2550 DEBUG_EXECUTE_r(
2551 PerlIO_printf(Perl_debug_log,
2552 "%*s %smatched empty string...%s\n",
2553 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2554 );
2555 break;
2556 } else {
2557 DEBUG_EXECUTE_r(
2558 PerlIO_printf(Perl_debug_log,
2559 "%*s %sfailed to match start class...%s\n",
2560 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4], PL_colors[5])
2561 );
2562 sayNO_SILENT;
2563 }
2564 }
166ba7cd 2565
07be1b83
YO
2566 /*
2567 traverse the TRIE keeping track of all accepting states
2568 we transition through until we get to a failing node.
2569 */
2570
a3621e74
YO
2571 while ( state && uc <= (U8*)PL_regeol ) {
2572
5b47454d 2573 if (trie->states[ state ].wordnum) {
166ba7cd 2574 if (!ST.accepted ) {
5b47454d
DM
2575 ENTER;
2576 SAVETMPS;
2577 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
2578 sv_accept_buff=newSV(bufflen *
2579 sizeof(reg_trie_accepted) - 1);
2580 SvCUR_set(sv_accept_buff,
2581 sizeof(reg_trie_accepted));
2582 SvPOK_on(sv_accept_buff);
2583 sv_2mortal(sv_accept_buff);
166ba7cd
DM
2584 SAVETMPS;
2585 ST.accept_buff =
5b47454d
DM
2586 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
2587 }
2588 else {
166ba7cd 2589 if (ST.accepted >= bufflen) {
5b47454d 2590 bufflen *= 2;
166ba7cd 2591 ST.accept_buff =(reg_trie_accepted*)
5b47454d
DM
2592 SvGROW(sv_accept_buff,
2593 bufflen * sizeof(reg_trie_accepted));
2594 }
2595 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
2596 + sizeof(reg_trie_accepted));
2597 }
166ba7cd
DM
2598 ST.accept_buff[ST.accepted].wordnum = trie->states[state].wordnum;
2599 ST.accept_buff[ST.accepted].endpos = uc;
2600 ++ST.accepted;
5b47454d 2601 }
a3621e74
YO
2602
2603 base = trie->states[ state ].trans.base;
2604
07be1b83
YO
2605 DEBUG_TRIE_EXECUTE_r({
2606 dump_exec_pos( (char *)uc, scan, do_utf8 );
a3621e74 2607 PerlIO_printf( Perl_debug_log,
e4584336 2608 "%*s %sState: %4"UVxf", Base: %4"UVxf", Accepted: %4"UVxf" ",
07be1b83 2609 2+PL_regindent * 2, "", PL_colors[4],
166ba7cd 2610 (UV)state, (UV)base, (UV)ST.accepted );
07be1b83 2611 });
a3621e74
YO
2612
2613 if ( base ) {
4cadc6a9
YO
2614 REXEC_TRIE_READ_CHAR(trie_type, trie, uc, uscan, len,
2615 uvc, charid, foldlen, foldbuf, uniflags);
a3621e74 2616
5b47454d
DM
2617 if (charid &&
2618 (base + charid > trie->uniquecharcount )
2619 && (base + charid - 1 - trie->uniquecharcount
2620 < trie->lasttrans)
2621 && trie->trans[base + charid - 1 -
2622 trie->uniquecharcount].check == state)
2623 {
2624 state = trie->trans[base + charid - 1 -
2625 trie->uniquecharcount ].next;
2626 }
2627 else {
2628 state = 0;
2629 }
2630 uc += len;
2631
2632 }
2633 else {
a3621e74
YO
2634 state = 0;
2635 }
2636 DEBUG_TRIE_EXECUTE_r(
e4584336
RB
2637 PerlIO_printf( Perl_debug_log,
2638 "Charid:%3x CV:%4"UVxf" After State: %4"UVxf"%s\n",
2639 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
2640 );
2641 }
166ba7cd 2642 if (!ST.accepted )
a3621e74 2643 sayNO;
a3621e74 2644
166ba7cd
DM
2645 DEBUG_EXECUTE_r(
2646 PerlIO_printf( Perl_debug_log,
2647 "%*s %sgot %"IVdf" possible matches%s\n",
2648 REPORT_CODE_OFF + PL_regindent * 2, "",
2649 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
2650 );
2651 }
2652
2653 /* FALL THROUGH */
2654
2655 case TRIE_next_fail: /* we failed - try next alterative */
2656
2657 if ( ST.accepted == 1 ) {
2658 /* only one choice left - just continue */
2659 DEBUG_EXECUTE_r({
2660 reg_trie_data * const trie
2661 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2662 SV ** const tmp = RX_DEBUG(reginfo->prog)
2663 ? av_fetch( trie->words, ST.accept_buff[ 0 ].wordnum-1, 0 )
2664 : NULL;
2665 PerlIO_printf( Perl_debug_log,
2666 "%*s %sonly one match left: #%d <%s>%s\n",
2667 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2668 ST.accept_buff[ 0 ].wordnum,
2669 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr",
2670 PL_colors[5] );
2671 });
2672 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
2673 /* in this case we free tmps/leave before we call regmatch
2674 as we wont be using accept_buff again. */
2675 FREETMPS;
2676 LEAVE;
2677 locinput = PL_reginput;
2678 nextchr = UCHARAT(locinput);
2679 scan = ST.B;
2680 continue; /* execute rest of RE */
2681 }
2682
2683 if (!ST.accepted-- ) {
2684 FREETMPS;
2685 LEAVE;
2686 sayNO;
2687 }
2688
a3621e74 2689 /*
166ba7cd
DM
2690 There are at least two accepting states left. Presumably
2691 the number of accepting states is going to be low,
2692 typically two. So we simply scan through to find the one
2693 with lowest wordnum. Once we find it, we swap the last
2694 state into its place and decrement the size. We then try to
2695 match the rest of the pattern at the point where the word
2696 ends. If we succeed, control just continues along the
2697 regex; if we fail we return here to try the next accepting
2698 state
2699 */
a3621e74 2700
166ba7cd
DM
2701 {
2702 U32 best = 0;
2703 U32 cur;
2704 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
2705 DEBUG_TRIE_EXECUTE_r(
f2278c82 2706 PerlIO_printf( Perl_debug_log,
166ba7cd
DM
2707 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
2708 REPORT_CODE_OFF + PL_regindent * 2, "", PL_colors[4],
2709 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
2710 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
2711 );
2712
2713 if (ST.accept_buff[cur].wordnum <
2714 ST.accept_buff[best].wordnum)
2715 best = cur;
a3621e74 2716 }
166ba7cd
DM
2717
2718 DEBUG_EXECUTE_r({
2719 reg_trie_data * const trie
2720 = (reg_trie_data*)rex->data->data[ ARG(ST.me) ];
2721 SV ** const tmp = RX_DEBUG(reginfo->prog)
2722 ? av_fetch( trie->words, ST.accept_buff[ best ].wordnum - 1, 0 )
2723 : NULL;
2724 PerlIO_printf( Perl_debug_log, "%*s %strying alternation #%d <%s> at node #%d %s\n",
2725 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],
2726 ST.accept_buff[best].wordnum,
2727 tmp ? SvPV_nolen_const( *tmp ) : "not compiled under -Dr", REG_NODE_NUM(scan),
2728 PL_colors[5] );
2729 });
2730
2731 if ( best<ST.accepted ) {
2732 reg_trie_accepted tmp = ST.accept_buff[ best ];
2733 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
2734 ST.accept_buff[ ST.accepted ] = tmp;
2735 best = ST.accepted;
a3621e74 2736 }
166ba7cd
DM
2737 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
2738 }
2739 PUSH_STATE_GOTO(TRIE_next, ST.B);
2740 /* NOTREACHED */
2741
2742#undef ST
2743
95b24440
DM
2744 case EXACT: {
2745 char *s = STRING(scan);
5d9a96ca 2746 st->ln = STR_LEN(scan);
eb160463 2747 if (do_utf8 != UTF) {
bc517b45 2748 /* The target and the pattern have differing utf8ness. */
1aa99e6b 2749 char *l = locinput;
be8e71aa 2750 const char * const e = s + st->ln;
a72c7584 2751
5ff6fc6d
JH
2752 if (do_utf8) {
2753 /* The target is utf8, the pattern is not utf8. */
1aa99e6b 2754 while (s < e) {
a3b680e6 2755 STRLEN ulen;
1aa99e6b 2756 if (l >= PL_regeol)
5ff6fc6d
JH
2757 sayNO;
2758 if (NATIVE_TO_UNI(*(U8*)s) !=
89ebb4a3 2759 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
041457d9 2760 uniflags))
5ff6fc6d 2761 sayNO;
bc517b45 2762 l += ulen;
5ff6fc6d 2763 s ++;
1aa99e6b 2764 }
5ff6fc6d
JH
2765 }
2766 else {
2767 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 2768 while (s < e) {
a3b680e6 2769 STRLEN ulen;
1aa99e6b
IH
2770 if (l >= PL_regeol)
2771 sayNO;
5ff6fc6d 2772 if (NATIVE_TO_UNI(*((U8*)l)) !=
89ebb4a3 2773 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
041457d9 2774 uniflags))
1aa99e6b 2775 sayNO;
bc517b45 2776 s += ulen;
a72c7584 2777 l ++;
1aa99e6b 2778 }
5ff6fc6d 2779 }
1aa99e6b
IH
2780 locinput = l;
2781 nextchr = UCHARAT(locinput);
2782 break;
2783 }
bc517b45 2784 /* The target and the pattern have the same utf8ness. */
d6a28714
JH
2785 /* Inline the first character, for speed. */
2786 if (UCHARAT(s) != nextchr)
2787 sayNO;
5d9a96ca 2788 if (PL_regeol - locinput < st->ln)
d6a28714 2789 sayNO;
5d9a96ca 2790 if (st->ln > 1 && memNE(s, locinput, st->ln))
d6a28714 2791 sayNO;
5d9a96ca 2792 locinput += st->ln;
d6a28714
JH
2793 nextchr = UCHARAT(locinput);
2794 break;
95b24440 2795 }
d6a28714 2796 case EXACTFL:
b8c5462f
JH
2797 PL_reg_flags |= RF_tainted;
2798 /* FALL THROUGH */
95b24440 2799 case EXACTF: {
be8e71aa 2800 char * const s = STRING(scan);
5d9a96ca 2801 st->ln = STR_LEN(scan);
d6a28714 2802
d07ddd77
JH
2803 if (do_utf8 || UTF) {
2804 /* Either target or the pattern are utf8. */
be8e71aa 2805 const char * const l = locinput;
d07ddd77 2806 char *e = PL_regeol;
bc517b45 2807
5d9a96ca 2808 if (ibcmp_utf8(s, 0, st->ln, (bool)UTF,
1feea2c7 2809 l, &e, 0, do_utf8)) {
5486206c
JH
2810 /* One more case for the sharp s:
2811 * pack("U0U*", 0xDF) =~ /ss/i,
2812 * the 0xC3 0x9F are the UTF-8
2813 * byte sequence for the U+00DF. */
2814 if (!(do_utf8 &&
2815 toLOWER(s[0]) == 's' &&
5d9a96ca 2816 st->ln >= 2 &&
5486206c
JH
2817 toLOWER(s[1]) == 's' &&
2818 (U8)l[0] == 0xC3 &&
2819 e - l >= 2 &&
2820 (U8)l[1] == 0x9F))
2821 sayNO;
2822 }
d07ddd77
JH
2823 locinput = e;
2824 nextchr = UCHARAT(locinput);
2825 break;
a0ed51b3 2826 }
d6a28714 2827
bc517b45
JH
2828 /* Neither the target and the pattern are utf8. */
2829
d6a28714
JH
2830 /* Inline the first character, for speed. */
2831 if (UCHARAT(s) != nextchr &&
2832 UCHARAT(s) != ((OP(scan) == EXACTF)
2833 ? PL_fold : PL_fold_locale)[nextchr])
a0ed51b3 2834 sayNO;
5d9a96ca 2835 if (PL_regeol - locinput < st->ln)
b8c5462f 2836 sayNO;
5d9a96ca
DM
2837 if (st->ln > 1 && (OP(scan) == EXACTF
2838 ? ibcmp(s, locinput, st->ln)
2839 : ibcmp_locale(s, locinput, st->ln)))
4633a7c4 2840 sayNO;
5d9a96ca 2841 locinput += st->ln;
d6a28714 2842 nextchr = UCHARAT(locinput);
a0d0e21e 2843 break;
95b24440 2844 }
d6a28714 2845 case ANYOF:
ffc61ed2 2846 if (do_utf8) {
9e55ce06
JH
2847 STRLEN inclasslen = PL_regeol - locinput;
2848
32fc9b6a 2849 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
e0f9d4a8 2850 sayNO_ANYOF;
ffc61ed2
JH
2851 if (locinput >= PL_regeol)
2852 sayNO;
0f0076b4 2853 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
b8c5462f 2854 nextchr = UCHARAT(locinput);
e0f9d4a8 2855 break;
ffc61ed2
JH
2856 }
2857 else {
2858 if (nextchr < 0)
2859 nextchr = UCHARAT(locinput);
32fc9b6a 2860 if (!REGINCLASS(rex, scan, (U8*)locinput))
e0f9d4a8 2861 sayNO_ANYOF;
ffc61ed2
JH
2862 if (!nextchr && locinput >= PL_regeol)
2863 sayNO;
2864 nextchr = UCHARAT(++locinput);
e0f9d4a8
JH
2865 break;
2866 }
2867 no_anyof:
2868 /* If we might have the case of the German sharp s
2869 * in a casefolding Unicode character class. */
2870
ebc501f0
JH
2871 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2872 locinput += SHARP_S_SKIP;
e0f9d4a8 2873 nextchr = UCHARAT(locinput);
ffc61ed2 2874 }
e0f9d4a8
JH
2875 else
2876 sayNO;
b8c5462f 2877 break;
d6a28714 2878 case ALNUML:
b8c5462f
JH
2879 PL_reg_flags |= RF_tainted;
2880 /* FALL THROUGH */
d6a28714 2881 case ALNUM:
b8c5462f 2882 if (!nextchr)
4633a7c4 2883 sayNO;
ffc61ed2 2884 if (do_utf8) {
1a4fad37 2885 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2886 if (!(OP(scan) == ALNUM
bb7a0f54 2887 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714 2888 : isALNUM_LC_utf8((U8*)locinput)))
b8c5462f
JH
2889 {
2890 sayNO;
a0ed51b3 2891 }
b8c5462f 2892 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
2893 nextchr = UCHARAT(locinput);
2894 break;
2895 }
ffc61ed2 2896 if (!(OP(scan) == ALNUM
d6a28714 2897 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
bbce6d69 2898 sayNO;
b8c5462f 2899 nextchr = UCHARAT(++locinput);
a0d0e21e 2900 break;
d6a28714 2901 case NALNUML:
b8c5462f
JH
2902 PL_reg_flags |= RF_tainted;
2903 /* FALL THROUGH */
d6a28714
JH
2904 case NALNUM:
2905 if (!nextchr && locinput >= PL_regeol)
a0ed51b3 2906 sayNO;
ffc61ed2 2907 if (do_utf8) {
1a4fad37 2908 LOAD_UTF8_CHARCLASS_ALNUM();
ffc61ed2 2909 if (OP(scan) == NALNUM
bb7a0f54 2910 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
d6a28714
JH
2911 : isALNUM_LC_utf8((U8*)locinput))
2912 {
b8c5462f 2913 sayNO;
d6a28714 2914 }
b8c5462f
JH
2915 locinput += PL_utf8skip[nextchr];
2916 nextchr = UCHARAT(locinput);
2917 break;
2918 }
ffc61ed2 2919 if (OP(scan) == NALNUM
d6a28714 2920 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
b8c5462f 2921 sayNO;
76e3520e 2922 nextchr = UCHARAT(++locinput);
a0d0e21e 2923 break;
d6a28714
JH
2924 case BOUNDL:
2925 case NBOUNDL:
3280af22 2926 PL_reg_flags |= RF_tainted;
bbce6d69 2927 /* FALL THROUGH */
d6a28714
JH
2928 case BOUND:
2929 case NBOUND:
2930 /* was last char in word? */
ffc61ed2 2931 if (do_utf8) {
12d33761 2932 if (locinput == PL_bostr)
5d9a96ca 2933 st->ln = '\n';
ffc61ed2 2934 else {
a3b680e6 2935 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
9041c2e3 2936
4ad0818d 2937 st->ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
ffc61ed2
JH
2938 }
2939 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 2940 st->ln = isALNUM_uni(st->ln);
1a4fad37 2941 LOAD_UTF8_CHARCLASS_ALNUM();
3568d838 2942 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
ffc61ed2
JH
2943 }
2944 else {
5d9a96ca 2945 st->ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(st->ln));
ffc61ed2
JH
2946 n = isALNUM_LC_utf8((U8*)locinput);
2947 }
a0ed51b3 2948 }
d6a28714 2949 else {
5d9a96ca 2950 st->ln = (locinput != PL_bostr) ?
12d33761 2951 UCHARAT(locinput - 1) : '\n';
ffc61ed2 2952 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
5d9a96ca 2953 st->ln = isALNUM(st->ln);
ffc61ed2
JH
2954 n = isALNUM(nextchr);
2955 }
2956 else {
5d9a96ca 2957 st->ln = isALNUM_LC(st->ln);
ffc61ed2
JH
2958 n = isALNUM_LC(nextchr);
2959 }
d6a28714 2960 }
5d9a96ca 2961 if (((!st->ln) == (!n)) == (OP(scan) == BOUND ||
ffc61ed2
JH
2962 OP(scan) == BOUNDL))
2963 sayNO;
a0ed51b3 2964 break;
d6a28714 2965 case SPACEL:
3280af22 2966 PL_reg_flags |= RF_tainted;
bbce6d69 2967 /* FALL THROUGH */
d6a28714 2968 case SPACE:
9442cb0e 2969 if (!nextchr)
4633a7c4 2970 sayNO;
1aa99e6b 2971 if (do_utf8) {
fd400ab9 2972 if (UTF8_IS_CONTINUED(nextchr)) {
1a4fad37 2973 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 2974 if (!(OP(scan) == SPACE
bb7a0f54 2975 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
ffc61ed2
JH
2976 : isSPACE_LC_utf8((U8*)locinput)))
2977 {
2978 sayNO;
2979 }
2980 locinput += PL_utf8skip[nextchr];
2981 nextchr = UCHARAT(locinput);
2982 break;
d6a28714 2983 }
ffc61ed2
JH
2984 if (!(OP(scan) == SPACE
2985 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2986 sayNO;
2987 nextchr = UCHARAT(++locinput);
2988 }
2989 else {
2990 if (!(OP(scan) == SPACE
2991 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2992 sayNO;
2993 nextchr = UCHARAT(++locinput);
a0ed51b3 2994 }
a0ed51b3 2995 break;
d6a28714 2996 case NSPACEL:
3280af22 2997 PL_reg_flags |= RF_tainted;
bbce6d69 2998 /* FALL THROUGH */
d6a28714 2999 case NSPACE:
9442cb0e 3000 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3001 sayNO;
1aa99e6b 3002 if (do_utf8) {
1a4fad37 3003 LOAD_UTF8_CHARCLASS_SPACE();
ffc61ed2 3004 if (OP(scan) == NSPACE
bb7a0f54 3005 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
d6a28714 3006 : isSPACE_LC_utf8((U8*)locinput))
b8c5462f
JH
3007 {
3008 sayNO;
3009 }
3010 locinput += PL_utf8skip[nextchr];
3011 nextchr = UCHARAT(locinput);
3012 break;
a0ed51b3 3013 }
ffc61ed2 3014 if (OP(scan) == NSPACE
d6a28714 3015 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
4633a7c4 3016 sayNO;
76e3520e 3017 nextchr = UCHARAT(++locinput);
a0d0e21e 3018 break;
d6a28714 3019 case DIGITL:
a0ed51b3
LW
3020 PL_reg_flags |= RF_tainted;
3021 /* FALL THROUGH */
d6a28714 3022 case DIGIT:
9442cb0e 3023 if (!nextchr)
a0ed51b3 3024 sayNO;
1aa99e6b 3025 if (do_utf8) {
1a4fad37 3026 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3027 if (!(OP(scan) == DIGIT
bb7a0f54 3028 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e 3029 : isDIGIT_LC_utf8((U8*)locinput)))
dfe13c55 3030 {
a0ed51b3 3031 sayNO;
dfe13c55 3032 }
6f06b55f 3033 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3034 nextchr = UCHARAT(locinput);
3035 break;
3036 }
ffc61ed2 3037 if (!(OP(scan) == DIGIT
9442cb0e 3038 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
a0ed51b3
LW
3039 sayNO;
3040 nextchr = UCHARAT(++locinput);
3041 break;
d6a28714 3042 case NDIGITL:
b8c5462f
JH
3043 PL_reg_flags |= RF_tainted;
3044 /* FALL THROUGH */
d6a28714 3045 case NDIGIT:
9442cb0e 3046 if (!nextchr && locinput >= PL_regeol)
b8c5462f 3047 sayNO;
1aa99e6b 3048 if (do_utf8) {
1a4fad37 3049 LOAD_UTF8_CHARCLASS_DIGIT();
ffc61ed2 3050 if (OP(scan) == NDIGIT
bb7a0f54 3051 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
9442cb0e
GS
3052 : isDIGIT_LC_utf8((U8*)locinput))
3053 {
a0ed51b3 3054 sayNO;
9442cb0e 3055 }
6f06b55f 3056 locinput += PL_utf8skip[nextchr];
a0ed51b3
LW
3057 nextchr = UCHARAT(locinput);
3058 break;
3059 }
ffc61ed2 3060 if (OP(scan) == NDIGIT
9442cb0e 3061 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
a0ed51b3
LW
3062 sayNO;
3063 nextchr = UCHARAT(++locinput);
3064 break;
3065 case CLUMP:
b7c83a7e 3066 if (locinput >= PL_regeol)
a0ed51b3 3067 sayNO;
b7c83a7e 3068 if (do_utf8) {
1a4fad37 3069 LOAD_UTF8_CHARCLASS_MARK();
b7c83a7e
JH
3070 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3071 sayNO;
3072 locinput += PL_utf8skip[nextchr];
3073 while (locinput < PL_regeol &&
3074 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3075 locinput += UTF8SKIP(locinput);
3076 if (locinput > PL_regeol)
3077 sayNO;
eb08e2da
JH
3078 }
3079 else
3080 locinput++;
a0ed51b3
LW
3081 nextchr = UCHARAT(locinput);
3082 break;
c8756f30 3083 case REFFL:
3280af22 3084 PL_reg_flags |= RF_tainted;
c8756f30 3085 /* FALL THROUGH */
c277df42 3086 case REF:
95b24440
DM
3087 case REFF: {
3088 char *s;
c277df42 3089 n = ARG(scan); /* which paren pair */
5d9a96ca 3090 st->ln = PL_regstartp[n];
2c2d71f5 3091 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3092 if ((I32)*PL_reglastparen < n || st->ln == -1)
af3f8c16 3093 sayNO; /* Do not match unless seen CLOSEn. */
5d9a96ca 3094 if (st->ln == PL_regendp[n])
a0d0e21e 3095 break;
a0ed51b3 3096
5d9a96ca 3097 s = PL_bostr + st->ln;
1aa99e6b 3098 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
a0ed51b3 3099 char *l = locinput;
a3b680e6 3100 const char *e = PL_bostr + PL_regendp[n];
a0ed51b3
LW
3101 /*
3102 * Note that we can't do the "other character" lookup trick as
3103 * in the 8-bit case (no pun intended) because in Unicode we
3104 * have to map both upper and title case to lower case.
3105 */
3106 if (OP(scan) == REFF) {
3107 while (s < e) {
a3b680e6
AL
3108 STRLEN ulen1, ulen2;
3109 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3110 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3111
a0ed51b3
LW
3112 if (l >= PL_regeol)
3113 sayNO;
a2a2844f
JH
3114 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3115 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
7114a2d2 3116 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
a0ed51b3 3117 sayNO;
a2a2844f
JH
3118 s += ulen1;
3119 l += ulen2;
a0ed51b3
LW
3120 }
3121 }
3122 locinput = l;
3123 nextchr = UCHARAT(locinput);
3124 break;
3125 }
3126
a0d0e21e 3127 /* Inline the first character, for speed. */
76e3520e 3128 if (UCHARAT(s) != nextchr &&
c8756f30
AK
3129 (OP(scan) == REF ||
3130 (UCHARAT(s) != ((OP(scan) == REFF
22c35a8c 3131 ? PL_fold : PL_fold_locale)[nextchr]))))
4633a7c4 3132 sayNO;
5d9a96ca
DM
3133 st->ln = PL_regendp[n] - st->ln;
3134 if (locinput + st->ln > PL_regeol)
4633a7c4 3135 sayNO;
5d9a96ca
DM
3136 if (st->ln > 1 && (OP(scan) == REF
3137 ? memNE(s, locinput, st->ln)
c8756f30 3138 : (OP(scan) == REFF
5d9a96ca
DM
3139 ? ibcmp(s, locinput, st->ln)
3140 : ibcmp_locale(s, locinput, st->ln))))
4633a7c4 3141 sayNO;
5d9a96ca 3142 locinput += st->ln;
76e3520e 3143 nextchr = UCHARAT(locinput);
a0d0e21e 3144 break;
95b24440 3145 }
a0d0e21e
LW
3146
3147 case NOTHING:
c277df42 3148 case TAIL:
a0d0e21e
LW
3149 break;
3150 case BACK:
3151 break;
40a82448
DM
3152
3153#undef ST
3154#define ST st->u.eval
3155
3156 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
c277df42 3157 {
c277df42 3158 SV *ret;
8e5e9ebe 3159 {
4aabdb9b
DM
3160 /* execute the code in the {...} */
3161 dSP;
6136c704 3162 SV ** const before = SP;
4aabdb9b
DM
3163 OP_4tree * const oop = PL_op;
3164 COP * const ocurcop = PL_curcop;
3165 PAD *old_comppad;
4aabdb9b
DM
3166
3167 n = ARG(scan);
32fc9b6a 3168 PL_op = (OP_4tree*)rex->data->data[n];
4aabdb9b 3169 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
32fc9b6a 3170 PAD_SAVE_LOCAL(old_comppad, (PAD*)rex->data->data[n + 2]);
4aabdb9b
DM
3171 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
3172
8e5e9ebe
RGS
3173 CALLRUNOPS(aTHX); /* Scalar context. */
3174 SPAGAIN;
3175 if (SP == before)
075aa684 3176 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
3177 else {
3178 ret = POPs;
3179 PUTBACK;
3180 }
4aabdb9b
DM
3181
3182 PL_op = oop;
3183 PAD_RESTORE_LOCAL(old_comppad);
3184 PL_curcop = ocurcop;
3185 if (!st->logical) {
3186 /* /(?{...})/ */
3187 sv_setsv(save_scalar(PL_replgv), ret);
4aabdb9b
DM
3188 break;
3189 }
8e5e9ebe 3190 }
4aabdb9b
DM
3191 if (st->logical == 2) { /* Postponed subexpression: /(??{...})/ */
3192 regexp *re;
4aabdb9b 3193 {
4f639d21
DM
3194 /* extract RE object from returned value; compiling if
3195 * necessary */
3196
6136c704 3197 MAGIC *mg = NULL;
be8e71aa 3198 const SV *sv;
faf82a0b
AE
3199 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
3200 mg = mg_find(sv, PERL_MAGIC_qr);
3201 else if (SvSMAGICAL(ret)) {
3202 if (SvGMAGICAL(ret))
3203 sv_unmagic(ret, PERL_MAGIC_qr);
3204 else
3205 mg = mg_find(ret, PERL_MAGIC_qr);
0f5d15d6 3206 }
faf82a0b 3207
0f5d15d6
IZ
3208 if (mg) {
3209 re = (regexp *)mg->mg_obj;
df0003d4 3210 (void)ReREFCNT_inc(re);
0f5d15d6
IZ
3211 }
3212 else {
3213 STRLEN len;
6136c704 3214 const char * const t = SvPV_const(ret, len);
0f5d15d6 3215 PMOP pm;
a3b680e6 3216 const I32 osize = PL_regsize;
0f5d15d6 3217
5fcd1c1b 3218 Zero(&pm, 1, PMOP);
4aabdb9b 3219 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
83003860 3220 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
9041c2e3 3221 if (!(SvFLAGS(ret)
faf82a0b
AE
3222 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3223 | SVs_GMG)))
14befaf4
DM
3224 sv_magic(ret,(SV*)ReREFCNT_inc(re),
3225 PERL_MAGIC_qr,0,0);
0f5d15d6 3226 PL_regsize = osize;
0f5d15d6 3227 }
4aabdb9b 3228 }
aa283a38
DM
3229
3230 /* run the pattern returned from (??{...}) */
4aabdb9b 3231 DEBUG_EXECUTE_r(
ab3bbdeb
YO
3232 debug_start_match(re, do_utf8, locinput, PL_regeol,
3233 "Matching embedded");
4aabdb9b 3234 );
2c2d71f5 3235
40a82448
DM
3236 ST.cp = regcppush(0); /* Save *all* the positions. */
3237 REGCP_SET(ST.lastcp);
4aabdb9b
DM
3238 *PL_reglastparen = 0;
3239 *PL_reglastcloseparen = 0;
4aabdb9b 3240 PL_reginput = locinput;
4aabdb9b
DM
3241
3242 /* XXXX This is too dramatic a measure... */
3243 PL_reg_maxiter = 0;
3244
5d9a96ca 3245 st->logical = 0;
40a82448 3246 ST.toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
aa283a38 3247 ((re->reganch & ROPT_UTF8) != 0);
40a82448
DM
3248 if (ST.toggleutf) PL_reg_flags ^= RF_utf8;
3249 ST.prev_rex = rex;
aa283a38 3250 rex = re;
aa283a38 3251
40a82448 3252 ST.B = next;
aa283a38 3253 /* now continue from first node in postoned RE */
40a82448 3254 PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
4aabdb9b 3255 /* NOTREACHED */
a0ed51b3 3256 }
4aabdb9b
DM
3257 /* /(?(?{...})X|Y)/ */
3258 st->sw = SvTRUE(ret);
3259 st->logical = 0;
c277df42
IZ
3260 break;
3261 }
40a82448
DM
3262
3263 case EVAL_A: /* successfully ran inner rex (??{rex}) */
3264 if (ST.toggleutf)
3265 PL_reg_flags ^= RF_utf8;
3266 ReREFCNT_dec(rex);
3267 rex = ST.prev_rex;
3268 /* XXXX This is too dramatic a measure... */
3269 PL_reg_maxiter = 0;
3270 /* Restore parens of the caller without popping the
3271 * savestack */
3272 {
3273 const I32 tmp = PL_savestack_ix;
3274 PL_savestack_ix = ST.lastcp;
3275 regcppop(rex);
3276 PL_savestack_ix = tmp;
3277 }
3278 PL_reginput = locinput;
3279 /* continue at the node following the (??{...}) */
3280 scan = ST.B;
3281 continue;
3282
3283 case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
3284 /* Restore state to the outer re then re-throw the failure */
3285 if (ST.toggleutf)
3286 PL_reg_flags ^= RF_utf8;
3287 ReREFCNT_dec(rex);
3288 rex = ST.prev_rex;
3289
3290 /* XXXX This is too dramatic a measure... */
3291 PL_reg_maxiter = 0;
3292
3293 PL_reginput = locinput;
3294 REGCP_UNWIND(ST.lastcp);
3295 regcppop(rex);
3296 sayNO_SILENT;
3297
3298#undef ST
3299
a0d0e21e 3300 case OPEN:
c277df42 3301 n = ARG(scan); /* which paren pair */
3280af22
NIS
3302 PL_reg_start_tmp[n] = locinput;
3303 if (n > PL_regsize)
3304 PL_regsize = n;
a0d0e21e
LW
3305 break;
3306 case CLOSE:
c277df42 3307 n = ARG(scan); /* which paren pair */
cf93c79d
IZ
3308 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3309 PL_regendp[n] = locinput - PL_bostr;
eb160463 3310 if (n > (I32)*PL_reglastparen)
3280af22 3311 *PL_reglastparen = n;
a01268b5 3312 *PL_reglastcloseparen = n;
a0d0e21e 3313 break;
c277df42
IZ
3314 case GROUPP:
3315 n = ARG(scan); /* which paren pair */
5d9a96ca 3316 st->sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
c277df42
IZ
3317 break;
3318 case IFTHEN:
2c2d71f5 3319 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5d9a96ca 3320 if (st->sw)
c277df42
IZ
3321 next = NEXTOPER(NEXTOPER(scan));
3322 else {
3323 next = scan + ARG(scan);
3324 if (OP(next) == IFTHEN) /* Fake one. */
3325 next = NEXTOPER(NEXTOPER(next));
3326 }
3327 break;
3328 case LOGICAL:
5d9a96ca 3329 st->logical = scan->flags;
c277df42 3330 break;
2ab05381 3331/*******************************************************************
a0374537
DM
3332 cc points to the regmatch_state associated with the most recent CURLYX.
3333 This struct contains info about the innermost (...)* loop (an
3334 "infoblock"), and a pointer to the next outer cc.
2ab05381
IZ
3335
3336 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3337
95b24440 3338 1) After matching Y, regnode for CURLYX is processed;
2ab05381 3339
a0374537 3340 2) This regnode populates cc, and calls regmatch() recursively
95b24440 3341 with the starting point at WHILEM node;
2ab05381
IZ
3342
3343 3) Each hit of WHILEM node tries to match A and Z (in the order
3344 depending on the current iteration, min/max of {min,max} and
3345 greediness). The information about where are nodes for "A"
a0374537 3346 and "Z" is read from cc, as is info on how many times "A"
2ab05381
IZ
3347 was already matched, and greediness.
3348
3349 4) After A matches, the same WHILEM node is hit again.
3350
95b24440 3351 5) Each time WHILEM is hit, cc is the infoblock created by CURLYX
2ab05381 3352 of the same pair. Thus when WHILEM tries to match Z, it temporarily
95b24440 3353 resets cc, since this Y(A)*Z can be a part of some other loop:
2ab05381
IZ
3354 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3355 of the external loop.
3356
a0374537 3357 Currently present infoblocks form a tree with a stem formed by st->cc
2ab05381
IZ
3358 and whatever it mentions via ->next, and additional attached trees
3359 corresponding to temporarily unset infoblocks as in "5" above.
3360
95b24440 3361 In the following picture, infoblocks for outer loop of
2ab05381
IZ
3362 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3363 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3364 infoblocks are drawn below the "reset" infoblock.
3365
3366 In fact in the picture below we do not show failed matches for Z and T
3367 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3368 more obvious *why* one needs to *temporary* unset infoblocks.]
3369
3370 Matched REx position InfoBlocks Comment
3371 (Y(A)*?Z)*?T x
3372 Y(A)*?Z)*?T x <- O
3373 Y (A)*?Z)*?T x <- O
3374 Y A)*?Z)*?T x <- O <- I
3375 YA )*?Z)*?T x <- O <- I
3376 YA A)*?Z)*?T x <- O <- I
3377 YAA )*?Z)*?T x <- O <- I
3378 YAA Z)*?T x <- O # Temporary unset I
3379 I
3380
3381 YAAZ Y(A)*?Z)*?T x <- O
3382 I
3383
3384 YAAZY (A)*?Z)*?T x <- O
3385 I
3386
3387 YAAZY A)*?Z)*?T x <- O <- I
3388 I
3389
3390 YAAZYA )*?Z)*?T x <- O <- I
3391 I
3392
3393 YAAZYA Z)*?T x <- O # Temporary unset I
3394 I,I
3395
3396 YAAZYAZ )*?T x <- O
3397 I,I
3398
3399 YAAZYAZ T x # Temporary unset O
3400 O
3401 I,I
3402
3403 YAAZYAZT x
3404 O
3405 I,I
3406 *******************************************************************/
95b24440 3407
a0d0e21e 3408 case CURLYX: {
cb434fcc
IZ
3409 /* No need to save/restore up to this paren */
3410 I32 parenfloor = scan->flags;
c277df42 3411
c2b7afd3
NC
3412 /* Dave says:
3413
3414 CURLYX and WHILEM are always paired: they're the moral
3415 equivalent of pp_enteriter anbd pp_iter.
3416
3417 The only time next could be null is if the node tree is
3418 corrupt. This was mentioned on p5p a few days ago.
3419
3420 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3421 So we'll assert that this is true:
3422 */
3423 assert(next);
30b2893d 3424 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
c277df42 3425 next += ARG(next);
cb434fcc
IZ
3426 /* XXXX Probably it is better to teach regpush to support
3427 parenfloor > PL_regsize... */
eb160463 3428 if (parenfloor > (I32)*PL_reglastparen)
cb434fcc 3429 parenfloor = *PL_reglastparen; /* Pessimization... */
a0374537 3430
d8319b27
DM
3431 st->u.curlyx.cp = PL_savestack_ix;
3432 st->u.curlyx.outercc = st->cc;
a0374537
DM
3433 st->cc = st;
3434 /* these fields contain the state of the current curly.
3435 * they are accessed by subsequent WHILEMs;
3436 * cur and lastloc are also updated by WHILEM */
d8319b27
DM
3437 st->u.curlyx.parenfloor = parenfloor;
3438 st->u.curlyx.cur = -1; /* this will be updated by WHILEM */
3439 st->u.curlyx.min = ARG1(scan);
3440 st->u.curlyx.max = ARG2(scan);
3441 st->u.curlyx.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3442 st->u.curlyx.lastloc = 0;
a0374537
DM
3443 /* st->next and st->minmod are also read by WHILEM */
3444
3280af22 3445 PL_reginput = locinput;
95b24440
DM
3446 REGMATCH(PREVOPER(next), CURLYX); /* start on the WHILEM */
3447 /*** all unsaved local vars undefined at this point */
d8319b27
DM
3448 regcpblow(st->u.curlyx.cp);
3449 st->cc = st->u.curlyx.outercc;
95b24440 3450 saySAME(result);
a0d0e21e 3451 }
5f66b61c 3452 /* NOTREACHED */
a0d0e21e
LW
3453 case WHILEM: {
3454 /*
3455 * This is really hard to understand, because after we match
3456 * what we're trying to match, we must make sure the rest of
2c2d71f5 3457 * the REx is going to match for sure, and to do that we have
a0d0e21e
LW
3458 * to go back UP the parse tree by recursing ever deeper. And
3459 * if it fails, we have to reset our parent's current state
3460 * that we can try again after backing off.
3461 */
3462
c2b7afd3
NC
3463 /* Dave says:
3464
3465 st->cc gets initialised by CURLYX ready for use by WHILEM.
3466 So again, unless somethings been corrupted, st->cc cannot
3467 be null at that point in WHILEM.
3468
3469 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
3470 So we'll assert that this is true:
3471 */
3472 assert(st->cc);
d8319b27
DM
3473 st->u.whilem.lastloc = st->cc->u.curlyx.lastloc; /* Detection of 0-len. */
3474 st->u.whilem.cache_offset = 0;
3475 st->u.whilem.cache_bit = 0;
c277df42 3476