This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Avoid use of $(...) as backticks in SH code
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
c74f6de9
KW
96#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
e0193e47 102/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
635cd5d4 105#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 106 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 107
c277df42
IZ
108/*
109 * Forwards.
110 */
111
f2ed9b32 112#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 113#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 114
3dab1dad
YO
115#define HOPc(pos,off) \
116 (char *)(PL_reg_match_utf8 \
52657f30 117 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
118 : (U8*)(pos + off))
119#define HOPBACKc(pos, off) \
07be1b83
YO
120 (char*)(PL_reg_match_utf8\
121 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
122 : (pos - off >= PL_bostr) \
8e11feef 123 ? (U8*)pos - off \
3dab1dad 124 : NULL)
efb30f32 125
e7409c1b 126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 127#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 128
7016d6eb
DM
129
130#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
131#define NEXTCHR_IS_EOS (nextchr < 0)
132
133#define SET_nextchr \
134 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
135
136#define SET_locinput(p) \
137 locinput = (p); \
138 SET_nextchr
139
140
c7304fe2
KW
141#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
142 if (!swash_ptr) { \
143 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2
KW
144 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
145 1, 0, NULL, &flags); \
146 assert(swash_ptr); \
147 } \
148 } STMT_END
149
150/* If in debug mode, we test that a known character properly matches */
151#ifdef DEBUGGING
152# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
153 property_name, \
154 utf8_char_in_property) \
155 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
156 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
157#else
158# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
159 property_name, \
160 utf8_char_in_property) \
161 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
162#endif
d1eb3177 163
c7304fe2
KW
164#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
165 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
166 swash_property_names[_CC_WORDCHAR], \
167 GREEK_SMALL_LETTER_IOTA_UTF8)
168
169#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
170 STMT_START { \
171 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
172 "_X_regular_begin", \
173 GREEK_SMALL_LETTER_IOTA_UTF8); \
174 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
175 "_X_extend", \
176 COMBINING_GRAVE_ACCENT_UTF8); \
177 } STMT_END
d1eb3177 178
c7304fe2 179#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
180/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
181
5f80c4cf 182/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
183/* it would be nice to rework regcomp.sym to generate this stuff. sigh
184 *
185 * NOTE that *nothing* that affects backtracking should be in here, specifically
186 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
187 * node that is in between two EXACT like nodes when ascertaining what the required
188 * "follow" character is. This should probably be moved to regex compile time
189 * although it may be done at run time beause of the REF possibility - more
190 * investigation required. -- demerphq
191*/
3e901dc0
YO
192#define JUMPABLE(rn) ( \
193 OP(rn) == OPEN || \
194 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
195 OP(rn) == EVAL || \
cca55fe3
JP
196 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
197 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 198 OP(rn) == KEEPS || \
3dab1dad 199 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 200)
ee9b8eae 201#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 202
ee9b8eae
YO
203#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
204
205#if 0
206/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
207 we don't need this definition. */
208#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 209#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
210#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
211
212#else
213/* ... so we use this as its faster. */
214#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 215#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
216#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
217#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
218
219#endif
e2d8ce26 220
a84d97b6
HS
221/*
222 Search for mandatory following text node; for lookahead, the text must
223 follow but for lookbehind (rn->flags != 0) we skip to the next step.
224*/
cca55fe3 225#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
226 while (JUMPABLE(rn)) { \
227 const OPCODE type = OP(rn); \
228 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 229 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 230 else if (type == PLUS) \
cca55fe3 231 rn = NEXTOPER(rn); \
3dab1dad 232 else if (type == IFMATCH) \
a84d97b6 233 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 234 else rn += NEXT_OFF(rn); \
3dab1dad 235 } \
5f80c4cf 236} STMT_END
74750237 237
22913b96
KW
238/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
239 * These are for the pre-composed Hangul syllables, which are all in a
240 * contiguous block and arranged there in such a way so as to facilitate
241 * alorithmic determination of their characteristics. As such, they don't need
242 * a swash, but can be determined by simple arithmetic. Almost all are
243 * GCB=LVT, but every 28th one is a GCB=LV */
244#define SBASE 0xAC00 /* Start of block */
245#define SCount 11172 /* Length of block */
246#define TCount 28
c476f425 247
acfe0abc 248static void restore_pos(pTHX_ void *arg);
51371543 249
87c0511b 250#define REGCP_PAREN_ELEMS 3
f067efbf 251#define REGCP_OTHER_ELEMS 3
e0fa7e2b 252#define REGCP_FRAME_ELEMS 1
620d5b66
NC
253/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
254 * are needed for the regexp context stack bookkeeping. */
255
76e3520e 256STATIC CHECKPOINT
92da3157 257S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 258{
97aff369 259 dVAR;
a3b680e6 260 const int retval = PL_savestack_ix;
92da3157
DM
261 const int paren_elems_to_push =
262 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
263 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
264 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 265 I32 p;
40a82448 266 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 267
b93070ed
DM
268 PERL_ARGS_ASSERT_REGCPPUSH;
269
e49a9654 270 if (paren_elems_to_push < 0)
5637ef5b
NC
271 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
272 paren_elems_to_push);
e49a9654 273
e0fa7e2b
NC
274 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
275 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 276 " out of range (%lu-%ld)",
92da3157
DM
277 total_elems,
278 (unsigned long)maxopenparen,
279 (long)parenfloor);
e0fa7e2b 280
620d5b66 281 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 282
495f47a5 283 DEBUG_BUFFERS_r(
92da3157 284 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
285 PerlIO_printf(Perl_debug_log,
286 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
287 PTR2UV(rex),
288 PTR2UV(rex->offs)
289 );
290 );
92da3157 291 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 292/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
293 SSPUSHINT(rex->offs[p].end);
294 SSPUSHINT(rex->offs[p].start);
1ca2007e 295 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 296 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
297 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
298 (UV)p,
299 (IV)rex->offs[p].start,
300 (IV)rex->offs[p].start_tmp,
301 (IV)rex->offs[p].end
40a82448 302 ));
a0d0e21e 303 }
b1ce53c5 304/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 305 SSPUSHINT(maxopenparen);
b93070ed
DM
306 SSPUSHINT(rex->lastparen);
307 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 308 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 309
a0d0e21e
LW
310 return retval;
311}
312
c277df42 313/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
314#define REGCP_SET(cp) \
315 DEBUG_STATE_r( \
ab3bbdeb 316 PerlIO_printf(Perl_debug_log, \
e4f74956 317 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
318 (IV)PL_savestack_ix)); \
319 cp = PL_savestack_ix
c3464db5 320
ab3bbdeb 321#define REGCP_UNWIND(cp) \
e4f74956 322 DEBUG_STATE_r( \
ab3bbdeb 323 if (cp != PL_savestack_ix) \
e4f74956
YO
324 PerlIO_printf(Perl_debug_log, \
325 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
326 (IV)(cp), (IV)PL_savestack_ix)); \
327 regcpblow(cp)
c277df42 328
a8d1f4b4
DM
329#define UNWIND_PAREN(lp, lcp) \
330 for (n = rex->lastparen; n > lp; n--) \
331 rex->offs[n].end = -1; \
332 rex->lastparen = n; \
333 rex->lastcloseparen = lcp;
334
335
f067efbf 336STATIC void
92da3157 337S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 338{
97aff369 339 dVAR;
e0fa7e2b 340 UV i;
87c0511b 341 U32 paren;
a3621e74
YO
342 GET_RE_DEBUG_FLAGS_DECL;
343
7918f24d
NC
344 PERL_ARGS_ASSERT_REGCPPOP;
345
b1ce53c5 346 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 347 i = SSPOPUV;
e0fa7e2b
NC
348 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
349 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
350 rex->lastcloseparen = SSPOPINT;
351 rex->lastparen = SSPOPINT;
92da3157 352 *maxopenparen_p = SSPOPINT;
b1ce53c5 353
620d5b66 354 i -= REGCP_OTHER_ELEMS;
b1ce53c5 355 /* Now restore the parentheses context. */
495f47a5
DM
356 DEBUG_BUFFERS_r(
357 if (i || rex->lastparen + 1 <= rex->nparens)
358 PerlIO_printf(Perl_debug_log,
359 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
360 PTR2UV(rex),
361 PTR2UV(rex->offs)
362 );
363 );
92da3157 364 paren = *maxopenparen_p;
620d5b66 365 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 366 I32 tmps;
1ca2007e 367 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 368 rex->offs[paren].start = SSPOPINT;
cf93c79d 369 tmps = SSPOPINT;
b93070ed
DM
370 if (paren <= rex->lastparen)
371 rex->offs[paren].end = tmps;
495f47a5
DM
372 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
373 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
374 (UV)paren,
375 (IV)rex->offs[paren].start,
376 (IV)rex->offs[paren].start_tmp,
377 (IV)rex->offs[paren].end,
378 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 379 );
87c0511b 380 paren--;
a0d0e21e 381 }
daf18116 382#if 1
dafc8851
JH
383 /* It would seem that the similar code in regtry()
384 * already takes care of this, and in fact it is in
385 * a better location to since this code can #if 0-ed out
386 * but the code in regtry() is needed or otherwise tests
387 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
388 * (as of patchlevel 7877) will fail. Then again,
389 * this code seems to be necessary or otherwise
225593e1
DM
390 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
391 * --jhi updated by dapm */
b93070ed 392 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 393 if (i > *maxopenparen_p)
b93070ed
DM
394 rex->offs[i].start = -1;
395 rex->offs[i].end = -1;
495f47a5
DM
396 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
397 " \\%"UVuf": %s ..-1 undeffing\n",
398 (UV)i,
92da3157 399 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 400 ));
a0d0e21e 401 }
dafc8851 402#endif
a0d0e21e
LW
403}
404
74088413
DM
405/* restore the parens and associated vars at savestack position ix,
406 * but without popping the stack */
407
408STATIC void
92da3157 409S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
410{
411 I32 tmpix = PL_savestack_ix;
412 PL_savestack_ix = ix;
92da3157 413 regcppop(rex, maxopenparen_p);
74088413
DM
414 PL_savestack_ix = tmpix;
415}
416
02db2b7b 417#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 418
31c7f561
KW
419STATIC bool
420S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
421{
422 /* Returns a boolean as to whether or not 'character' is a member of the
423 * Posix character class given by 'classnum' that should be equivalent to a
424 * value in the typedef '_char_class_number'.
425 *
426 * Ideally this could be replaced by a just an array of function pointers
427 * to the C library functions that implement the macros this calls.
428 * However, to compile, the precise function signatures are required, and
429 * these may vary from platform to to platform. To avoid having to figure
430 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
431 * which adds an extra layer of function call overhead (unless the C
432 * optimizer strips it away). But we don't particularly care about
433 * performance with locales anyway. */
31c7f561
KW
434
435 switch ((_char_class_number) classnum) {
15861f94 436 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 437 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
438 case _CC_ENUM_ASCII: return isASCII_LC(character);
439 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
440 case _CC_ENUM_CASED: return isLOWER_LC(character)
441 || isUPPER_LC(character);
e8d596e0 442 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
443 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
444 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
445 case _CC_ENUM_LOWER: return isLOWER_LC(character);
446 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 447 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 448 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 449 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
450 case _CC_ENUM_UPPER: return isUPPER_LC(character);
451 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 452 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
453 default: /* VERTSPACE should never occur in locales */
454 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
455 }
456
457 assert(0); /* NOTREACHED */
458 return FALSE;
459}
460
3018b823
KW
461STATIC bool
462S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
463{
464 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
465 * 'character' is a member of the Posix character class given by 'classnum'
466 * that should be equivalent to a value in the typedef
467 * '_char_class_number'.
468 *
469 * This just calls isFOO_lc on the code point for the character if it is in
470 * the range 0-255. Outside that range, all characters avoid Unicode
471 * rules, ignoring any locale. So use the Unicode function if this class
472 * requires a swash, and use the Unicode macro otherwise. */
473
474 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
475
476 if (UTF8_IS_INVARIANT(*character)) {
477 return isFOO_lc(classnum, *character);
478 }
479 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
480 return isFOO_lc(classnum,
481 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
482 }
483
484 if (classnum < _FIRST_NON_SWASH_CC) {
485
486 /* Initialize the swash unless done already */
487 if (! PL_utf8_swash_ptrs[classnum]) {
488 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
489 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
490 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
491 }
492
92a2046b
KW
493 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
494 character,
495 TRUE /* is UTF */ ));
3018b823
KW
496 }
497
498 switch ((_char_class_number) classnum) {
499 case _CC_ENUM_SPACE:
500 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
501
502 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
503 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
504 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
505 default: return 0; /* Things like CNTRL are always
506 below 256 */
507 }
508
509 assert(0); /* NOTREACHED */
510 return FALSE;
511}
512
a687059c 513/*
e50aee73 514 * pregexec and friends
a687059c
LW
515 */
516
76234dfb 517#ifndef PERL_IN_XSUB_RE
a687059c 518/*
c277df42 519 - pregexec - match a regexp against a string
a687059c 520 */
c277df42 521I32
5aaab254 522Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
c3464db5 523 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
524/* stringarg: the point in the string at which to begin matching */
525/* strend: pointer to null at end of string */
526/* strbeg: real beginning of string */
527/* minend: end of match must be >= minend bytes after stringarg. */
528/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
529 * itself is accessed via the pointers above */
530/* nosave: For optimizations. */
c277df42 531{
7918f24d
NC
532 PERL_ARGS_ASSERT_PREGEXEC;
533
c277df42 534 return
9041c2e3 535 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
536 nosave ? 0 : REXEC_COPY_STR);
537}
76234dfb 538#endif
22e551b9 539
9041c2e3 540/*
cad2e5aa
JH
541 * Need to implement the following flags for reg_anch:
542 *
543 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
544 * USE_INTUIT_ML
545 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
546 * INTUIT_AUTORITATIVE_ML
547 * INTUIT_ONCE_NOML - Intuit can match in one location only.
548 * INTUIT_ONCE_ML
549 *
550 * Another flag for this function: SECOND_TIME (so that float substrs
551 * with giant delta may be not rechecked).
552 */
553
554/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
555
3f7c398e 556/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
557 Otherwise, only SvCUR(sv) is used to get strbeg. */
558
559/* XXXX We assume that strpos is strbeg unless sv. */
560
6eb5f6b9
JH
561/* XXXX Some places assume that there is a fixed substring.
562 An update may be needed if optimizer marks as "INTUITable"
563 RExen without fixed substrings. Similarly, it is assumed that
564 lengths of all the strings are no more than minlen, thus they
565 cannot come from lookahead.
40d049e4
YO
566 (Or minlen should take into account lookahead.)
567 NOTE: Some of this comment is not correct. minlen does now take account
568 of lookahead/behind. Further research is required. -- demerphq
569
570*/
6eb5f6b9 571
2c2d71f5
JH
572/* A failure to find a constant substring means that there is no need to make
573 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 574 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
575 regtry() should be needed.
576
577 REx compiler's optimizer found 4 possible hints:
578 a) Anchored substring;
579 b) Fixed substring;
580 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 581 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 582 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
583 string which does not contradict any of them.
584 */
2c2d71f5 585
6eb5f6b9
JH
586/* Most of decisions we do here should have been done at compile time.
587 The nodes of the REx which we used for the search should have been
588 deleted from the finite automaton. */
589
cad2e5aa 590char *
288b8c02 591Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 592 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 593{
97aff369 594 dVAR;
8d919b0a 595 struct regexp *const prog = ReANY(rx);
eb578fdb 596 I32 start_shift = 0;
cad2e5aa 597 /* Should be nonnegative! */
eb578fdb
KW
598 I32 end_shift = 0;
599 char *s;
600 SV *check;
a1933d95 601 char *strbeg;
cad2e5aa 602 char *t;
f2ed9b32 603 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 604 I32 ml_anch;
eb578fdb 605 char *other_last = NULL; /* other substr checked before this */
bd61b366 606 char *check_at = NULL; /* check substr found at this pos */
d8080198 607 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 608 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 609 RXi_GET_DECL(prog,progi);
984e6dd1 610 bool is_utf8_pat;
30944b6d 611#ifdef DEBUGGING
890ce7af 612 const char * const i_strpos = strpos;
30944b6d 613#endif
a3621e74
YO
614 GET_RE_DEBUG_FLAGS_DECL;
615
7918f24d 616 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
617 PERL_UNUSED_ARG(flags);
618 PERL_UNUSED_ARG(data);
7918f24d 619
f2ed9b32 620 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 621
984e6dd1 622 is_utf8_pat = cBOOL(RX_UTF8(rx));
4fab19ce 623
ab3bbdeb 624 DEBUG_EXECUTE_r(
f2ed9b32 625 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
626 sv ? "Guessing start of match in sv for"
627 : "Guessing start of match in string for");
2a782b5b 628 );
cad2e5aa 629
c344f387
JH
630 /* CHR_DIST() would be more correct here but it makes things slow. */
631 if (prog->minlen > strend - strpos) {
a3621e74 632 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 633 "String too short... [re_intuit_start]\n"));
cad2e5aa 634 goto fail;
2c2d71f5 635 }
d8da0584 636
7016d6eb
DM
637 /* XXX we need to pass strbeg as a separate arg: the following is
638 * guesswork and can be wrong... */
639 if (sv && SvPOK(sv)) {
640 char * p = SvPVX(sv);
641 STRLEN cur = SvCUR(sv);
642 if (p <= strpos && strpos < p + cur) {
643 strbeg = p;
644 assert(p <= strend && strend <= p + cur);
645 }
646 else
647 strbeg = strend - cur;
648 }
649 else
650 strbeg = strpos;
651
1aa99e6b 652 PL_regeol = strend;
f2ed9b32 653 if (utf8_target) {
33b8afdf
JH
654 if (!prog->check_utf8 && prog->check_substr)
655 to_utf8_substr(prog);
656 check = prog->check_utf8;
657 } else {
7e0d5ad7
KW
658 if (!prog->check_substr && prog->check_utf8) {
659 if (! to_byte_substr(prog)) {
6b54ddc5 660 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
661 }
662 }
33b8afdf
JH
663 check = prog->check_substr;
664 }
bbe252da
YO
665 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
666 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
667 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 668 && !multiline ) ); /* Check after \n? */
cad2e5aa 669
7e25d62c 670 if (!ml_anch) {
bbe252da
YO
671 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
672 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 673 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
674 && sv && !SvROK(sv)
675 && (strpos != strbeg)) {
a3621e74 676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
677 goto fail;
678 }
d46b78c6
KW
679 if (prog->check_offset_min == prog->check_offset_max
680 && !(prog->extflags & RXf_CANY_SEEN)
681 && ! multiline) /* /m can cause \n's to match that aren't
682 accounted for in the string max length.
683 See [perl #115242] */
684 {
2c2d71f5 685 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
686 I32 slen;
687
1aa99e6b 688 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 689
653099ff
GS
690 if (SvTAIL(check)) {
691 slen = SvCUR(check); /* >= 1 */
cad2e5aa 692
9041c2e3 693 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 694 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 695 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 696 goto fail_finish;
cad2e5aa
JH
697 }
698 /* Now should match s[0..slen-2] */
699 slen--;
3f7c398e 700 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 701 || (slen > 1
3f7c398e 702 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 703 report_neq:
a3621e74 704 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
705 goto fail_finish;
706 }
cad2e5aa 707 }
3f7c398e 708 else if (*SvPVX_const(check) != *s
653099ff 709 || ((slen = SvCUR(check)) > 1
3f7c398e 710 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 711 goto report_neq;
c315bfe8 712 check_at = s;
2c2d71f5 713 goto success_at_start;
7e25d62c 714 }
cad2e5aa 715 }
2c2d71f5 716 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 717 s = strpos;
2c2d71f5 718 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
719 end_shift = prog->check_end_shift;
720
2c2d71f5 721 if (!ml_anch) {
a3b680e6 722 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 723 - (SvTAIL(check) != 0);
a3b680e6 724 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
725
726 if (end_shift < eshift)
727 end_shift = eshift;
728 }
cad2e5aa 729 }
2c2d71f5 730 else { /* Can match at random position */
cad2e5aa
JH
731 ml_anch = 0;
732 s = strpos;
1de06328
YO
733 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
734 end_shift = prog->check_end_shift;
735
736 /* end shift should be non negative here */
cad2e5aa
JH
737 }
738
bcdf7404 739#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 740 if (end_shift < 0)
1de06328 741 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 742 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
743#endif
744
2c2d71f5
JH
745 restart:
746 /* Find a possible match in the region s..strend by looking for
747 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
748
749 {
750 I32 srch_start_shift = start_shift;
751 I32 srch_end_shift = end_shift;
c33e64f0
FC
752 U8* start_point;
753 U8* end_point;
1de06328
YO
754 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
755 srch_end_shift -= ((strbeg - s) - srch_start_shift);
756 srch_start_shift = strbeg - s;
757 }
6bda09f9 758 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
759 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
760 (IV)prog->check_offset_min,
761 (IV)srch_start_shift,
762 (IV)srch_end_shift,
763 (IV)prog->check_end_shift);
764 });
765
bbe252da 766 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
767 start_point= (U8*)(s + srch_start_shift);
768 end_point= (U8*)(strend - srch_end_shift);
769 } else {
770 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
771 end_point= HOP3(strend, -srch_end_shift, strbeg);
772 }
6bda09f9 773 DEBUG_OPTIMISE_MORE_r({
56570a2c 774 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 775 (int)(end_point - start_point),
fc8cd66c 776 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
777 start_point);
778 });
779
780 s = fbm_instr( start_point, end_point,
7fba1cd6 781 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 782 }
cad2e5aa
JH
783 /* Update the count-of-usability, remove useless subpatterns,
784 unshift s. */
2c2d71f5 785
ab3bbdeb 786 DEBUG_EXECUTE_r({
f2ed9b32 787 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
788 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
789 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 790 (s ? "Found" : "Did not find"),
f2ed9b32 791 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
792 ? "anchored" : "floating"),
793 quoted,
794 RE_SV_TAIL(check),
795 (s ? " at offset " : "...\n") );
796 });
2c2d71f5
JH
797
798 if (!s)
799 goto fail_finish;
2c2d71f5 800 /* Finish the diagnostic message */
a3621e74 801 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 802
1de06328
YO
803 /* XXX dmq: first branch is for positive lookbehind...
804 Our check string is offset from the beginning of the pattern.
805 So we need to do any stclass tests offset forward from that
806 point. I think. :-(
807 */
808
809
810
811 check_at=s;
812
813
2c2d71f5
JH
814 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
815 Start with the other substr.
816 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 817 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
818 *always* match. Probably should be marked during compile...
819 Probably it is right to do no SCREAM here...
820 */
821
f2ed9b32 822 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
823 : (prog->float_substr && prog->anchored_substr))
824 {
30944b6d 825 /* Take into account the "other" substring. */
2c2d71f5
JH
826 /* XXXX May be hopelessly wrong for UTF... */
827 if (!other_last)
6eb5f6b9 828 other_last = strpos;
f2ed9b32 829 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
830 do_other_anchored:
831 {
890ce7af
AL
832 char * const last = HOP3c(s, -start_shift, strbeg);
833 char *last1, *last2;
be8e71aa 834 char * const saved_s = s;
33b8afdf 835 SV* must;
2c2d71f5 836
2c2d71f5
JH
837 t = s - prog->check_offset_max;
838 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 839 && (!utf8_target
0ce71af7 840 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 841 && t > strpos)))
6f207bd3 842 NOOP;
2c2d71f5
JH
843 else
844 t = strpos;
1aa99e6b 845 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
846 if (t < other_last) /* These positions already checked */
847 t = other_last;
1aa99e6b 848 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
849 if (last < last1)
850 last1 = last;
1de06328
YO
851 /* XXXX It is not documented what units *_offsets are in.
852 We assume bytes, but this is clearly wrong.
853 Meaning this code needs to be carefully reviewed for errors.
854 dmq.
855 */
856
2c2d71f5 857 /* On end-of-str: see comment below. */
f2ed9b32 858 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
859 if (must == &PL_sv_undef) {
860 s = (char*)NULL;
1de06328 861 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
862 }
863 else
864 s = fbm_instr(
865 (unsigned char*)t,
866 HOP3(HOP3(last1, prog->anchored_offset, strend)
867 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
868 must,
7fba1cd6 869 multiline ? FBMrf_MULTILINE : 0
33b8afdf 870 );
ab3bbdeb 871 DEBUG_EXECUTE_r({
f2ed9b32 872 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
873 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
874 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 875 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
876 quoted, RE_SV_TAIL(must));
877 });
878
879
2c2d71f5
JH
880 if (!s) {
881 if (last1 >= last2) {
a3621e74 882 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
883 ", giving up...\n"));
884 goto fail_finish;
885 }
a3621e74 886 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 887 ", trying floating at offset %ld...\n",
be8e71aa 888 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
889 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
890 s = HOP3c(last, 1, strend);
2c2d71f5
JH
891 goto restart;
892 }
893 else {
a3621e74 894 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 895 (long)(s - i_strpos)));
1aa99e6b
IH
896 t = HOP3c(s, -prog->anchored_offset, strbeg);
897 other_last = HOP3c(s, 1, strend);
be8e71aa 898 s = saved_s;
2c2d71f5
JH
899 if (t == strpos)
900 goto try_at_start;
2c2d71f5
JH
901 goto try_at_offset;
902 }
30944b6d 903 }
2c2d71f5
JH
904 }
905 else { /* Take into account the floating substring. */
33b8afdf 906 char *last, *last1;
be8e71aa 907 char * const saved_s = s;
33b8afdf
JH
908 SV* must;
909
910 t = HOP3c(s, -start_shift, strbeg);
911 last1 = last =
912 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
913 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
914 last = HOP3c(t, prog->float_max_offset, strend);
915 s = HOP3c(t, prog->float_min_offset, strend);
916 if (s < other_last)
917 s = other_last;
2c2d71f5 918 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 919 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
920 /* fbm_instr() takes into account exact value of end-of-str
921 if the check is SvTAIL(ed). Since false positives are OK,
922 and end-of-str is not later than strend we are OK. */
923 if (must == &PL_sv_undef) {
924 s = (char*)NULL;
1de06328 925 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
926 }
927 else
2c2d71f5 928 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
929 (unsigned char*)last + SvCUR(must)
930 - (SvTAIL(must)!=0),
7fba1cd6 931 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 932 DEBUG_EXECUTE_r({
f2ed9b32 933 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
934 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
935 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 936 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
937 quoted, RE_SV_TAIL(must));
938 });
33b8afdf
JH
939 if (!s) {
940 if (last1 == last) {
a3621e74 941 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
942 ", giving up...\n"));
943 goto fail_finish;
2c2d71f5 944 }
a3621e74 945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 946 ", trying anchored starting at offset %ld...\n",
be8e71aa 947 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
948 other_last = last;
949 s = HOP3c(t, 1, strend);
950 goto restart;
951 }
952 else {
a3621e74 953 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
954 (long)(s - i_strpos)));
955 other_last = s; /* Fix this later. --Hugo */
be8e71aa 956 s = saved_s;
33b8afdf
JH
957 if (t == strpos)
958 goto try_at_start;
959 goto try_at_offset;
960 }
2c2d71f5 961 }
cad2e5aa 962 }
2c2d71f5 963
1de06328 964
9ef43ace 965 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 966
6bda09f9 967 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
968 PerlIO_printf(Perl_debug_log,
969 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
970 (IV)prog->check_offset_min,
971 (IV)prog->check_offset_max,
972 (IV)(s-strpos),
973 (IV)(t-strpos),
974 (IV)(t-s),
975 (IV)(strend-strpos)
976 )
977 );
978
2c2d71f5 979 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 980 && (!utf8_target
9ef43ace 981 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
982 && t > strpos)))
983 {
2c2d71f5
JH
984 /* Fixed substring is found far enough so that the match
985 cannot start at strpos. */
986 try_at_offset:
cad2e5aa 987 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
988 /* Eventually fbm_*() should handle this, but often
989 anchored_offset is not 0, so this check will not be wasted. */
990 /* XXXX In the code below we prefer to look for "^" even in
991 presence of anchored substrings. And we search even
992 beyond the found float position. These pessimizations
993 are historical artefacts only. */
994 find_anchor:
2c2d71f5 995 while (t < strend - prog->minlen) {
cad2e5aa 996 if (*t == '\n') {
4ee3650e 997 if (t < check_at - prog->check_offset_min) {
f2ed9b32 998 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
999 /* Since we moved from the found position,
1000 we definitely contradict the found anchored
30944b6d
IZ
1001 substr. Due to the above check we do not
1002 contradict "check" substr.
1003 Thus we can arrive here only if check substr
1004 is float. Redo checking for "other"=="fixed".
1005 */
9041c2e3 1006 strpos = t + 1;
a3621e74 1007 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 1008 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
1009 goto do_other_anchored;
1010 }
4ee3650e
GS
1011 /* We don't contradict the found floating substring. */
1012 /* XXXX Why not check for STCLASS? */
cad2e5aa 1013 s = t + 1;
a3621e74 1014 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1015 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1016 goto set_useful;
1017 }
4ee3650e
GS
1018 /* Position contradicts check-string */
1019 /* XXXX probably better to look for check-string
1020 than for "\n", so one should lower the limit for t? */
a3621e74 1021 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1022 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1023 other_last = strpos = s = t + 1;
cad2e5aa
JH
1024 goto restart;
1025 }
1026 t++;
1027 }
a3621e74 1028 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1029 PL_colors[0], PL_colors[1]));
2c2d71f5 1030 goto fail_finish;
cad2e5aa 1031 }
f5952150 1032 else {
a3621e74 1033 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1034 PL_colors[0], PL_colors[1]));
f5952150 1035 }
cad2e5aa
JH
1036 s = t;
1037 set_useful:
f2ed9b32 1038 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1039 }
1040 else {
f5952150 1041 /* The found string does not prohibit matching at strpos,
2c2d71f5 1042 - no optimization of calling REx engine can be performed,
f5952150
GS
1043 unless it was an MBOL and we are not after MBOL,
1044 or a future STCLASS check will fail this. */
2c2d71f5
JH
1045 try_at_start:
1046 /* Even in this situation we may use MBOL flag if strpos is offset
1047 wrt the start of the string. */
05b4157f 1048 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1049 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1050 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1051 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1052 {
cad2e5aa
JH
1053 t = strpos;
1054 goto find_anchor;
1055 }
a3621e74 1056 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1057 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1058 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1059 );
2c2d71f5 1060 success_at_start:
bbe252da 1061 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1062 && (utf8_target ? (
33b8afdf
JH
1063 prog->check_utf8 /* Could be deleted already */
1064 && --BmUSEFUL(prog->check_utf8) < 0
1065 && (prog->check_utf8 == prog->float_utf8)
1066 ) : (
1067 prog->check_substr /* Could be deleted already */
1068 && --BmUSEFUL(prog->check_substr) < 0
1069 && (prog->check_substr == prog->float_substr)
1070 )))
66e933ab 1071 {
cad2e5aa 1072 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1073 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1074 /* XXX Does the destruction order has to change with utf8_target? */
1075 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1076 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1077 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1078 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1079 check = NULL; /* abort */
cad2e5aa 1080 s = strpos;
486ec47a 1081 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1082 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1083 if (prog->intflags & PREGf_IMPLICIT)
1084 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1085 /* XXXX This is a remnant of the old implementation. It
1086 looks wasteful, since now INTUIT can use many
6eb5f6b9 1087 other heuristics. */
bbe252da 1088 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1089 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1090 }
1091 else
1092 s = strpos;
1093 }
1094
6eb5f6b9
JH
1095 /* Last resort... */
1096 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1097 /* trie stclasses are too expensive to use here, we are better off to
1098 leave it to regmatch itself */
f8fc2ecf 1099 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1100 /* minlen == 0 is possible if regstclass is \b or \B,
1101 and the fixed substr is ''$.
1102 Since minlen is already taken into account, s+1 is before strend;
1103 accidentally, minlen >= 1 guaranties no false positives at s + 1
1104 even for \b or \B. But (minlen? 1 : 0) below assumes that
1105 regstclass does not come from lookahead... */
1106 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1107 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1108 const U8* const str = (U8*)STRING(progi->regstclass);
1109 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1110 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1111 : 1);
1de06328
YO
1112 char * endpos;
1113 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1114 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1115 else if (prog->float_substr || prog->float_utf8)
1116 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1117 else
1118 endpos= strend;
1119
d8080198
YO
1120 if (checked_upto < s)
1121 checked_upto = s;
1122 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1123 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1124
6eb5f6b9 1125 t = s;
984e6dd1
DM
1126 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1127 NULL, is_utf8_pat);
d8080198
YO
1128 if (s) {
1129 checked_upto = s;
1130 } else {
6eb5f6b9 1131#ifdef DEBUGGING
cbbf8932 1132 const char *what = NULL;
6eb5f6b9
JH
1133#endif
1134 if (endpos == strend) {
a3621e74 1135 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1136 "Could not match STCLASS...\n") );
1137 goto fail;
1138 }
a3621e74 1139 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1140 "This position contradicts STCLASS...\n") );
bbe252da 1141 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1142 goto fail;
d8080198
YO
1143 checked_upto = HOPBACKc(endpos, start_shift);
1144 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1145 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1146 /* Contradict one of substrings */
33b8afdf 1147 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1148 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1149 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1150 hop_and_restart:
1aa99e6b 1151 s = HOP3c(t, 1, strend);
66e933ab
GS
1152 if (s + start_shift + end_shift > strend) {
1153 /* XXXX Should be taken into account earlier? */
a3621e74 1154 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1155 "Could not match STCLASS...\n") );
1156 goto fail;
1157 }
5e39e1e5
HS
1158 if (!check)
1159 goto giveup;
a3621e74 1160 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1161 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1162 what, (long)(s + start_shift - i_strpos)) );
1163 goto restart;
1164 }
66e933ab 1165 /* Have both, check_string is floating */
6eb5f6b9
JH
1166 if (t + start_shift >= check_at) /* Contradicts floating=check */
1167 goto retry_floating_check;
1168 /* Recheck anchored substring, but not floating... */
9041c2e3 1169 s = check_at;
5e39e1e5
HS
1170 if (!check)
1171 goto giveup;
a3621e74 1172 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1173 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1174 (long)(other_last - i_strpos)) );
1175 goto do_other_anchored;
1176 }
60e71179
GS
1177 /* Another way we could have checked stclass at the
1178 current position only: */
1179 if (ml_anch) {
1180 s = t = t + 1;
5e39e1e5
HS
1181 if (!check)
1182 goto giveup;
a3621e74 1183 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1184 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1185 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1186 goto try_at_offset;
66e933ab 1187 }
f2ed9b32 1188 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1189 goto fail;
486ec47a 1190 /* Check is floating substring. */
6eb5f6b9
JH
1191 retry_floating_check:
1192 t = check_at - start_shift;
a3621e74 1193 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1194 goto hop_and_restart;
1195 }
b7953727 1196 if (t != s) {
a3621e74 1197 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1198 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1199 (long)(t - i_strpos), (long)(s - i_strpos))
1200 );
1201 }
1202 else {
a3621e74 1203 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1204 "Does not contradict STCLASS...\n");
1205 );
1206 }
6eb5f6b9 1207 }
5e39e1e5 1208 giveup:
a3621e74 1209 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1210 PL_colors[4], (check ? "Guessed" : "Giving up"),
1211 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1212 return s;
2c2d71f5
JH
1213
1214 fail_finish: /* Substring not found */
33b8afdf 1215 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1216 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1217 fail:
a3621e74 1218 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1219 PL_colors[4], PL_colors[5]));
bd61b366 1220 return NULL;
cad2e5aa 1221}
9661b544 1222
a0a388a1
YO
1223#define DECL_TRIE_TYPE(scan) \
1224 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1225 trie_type = ((scan->flags == EXACT) \
1226 ? (utf8_target ? trie_utf8 : trie_plain) \
1227 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1228
fd3249ee
YO
1229#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1230STMT_START { \
fab2782b
YO
1231 STRLEN skiplen; \
1232 switch (trie_type) { \
1233 case trie_utf8_fold: \
1234 if ( foldlen>0 ) { \
1235 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1236 foldlen -= len; \
1237 uscan += len; \
1238 len=0; \
1239 } else { \
1240 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1241 len = UTF8SKIP(uc); \
1242 skiplen = UNISKIP( uvc ); \
1243 foldlen -= skiplen; \
1244 uscan = foldbuf + skiplen; \
1245 } \
1246 break; \
1247 case trie_latin_utf8_fold: \
1248 if ( foldlen>0 ) { \
1249 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1250 foldlen -= len; \
1251 uscan += len; \
1252 len=0; \
1253 } else { \
1254 len = 1; \
1255 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1256 skiplen = UNISKIP( uvc ); \
1257 foldlen -= skiplen; \
1258 uscan = foldbuf + skiplen; \
1259 } \
1260 break; \
1261 case trie_utf8: \
1262 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1263 break; \
1264 case trie_plain: \
1265 uvc = (UV)*uc; \
1266 len = 1; \
1267 } \
1268 if (uvc < 256) { \
1269 charid = trie->charmap[ uvc ]; \
1270 } \
1271 else { \
1272 charid = 0; \
1273 if (widecharmap) { \
1274 SV** const svpp = hv_fetch(widecharmap, \
1275 (char*)&uvc, sizeof(UV), 0); \
1276 if (svpp) \
1277 charid = (U16)SvIV(*svpp); \
1278 } \
1279 } \
4cadc6a9
YO
1280} STMT_END
1281
4cadc6a9
YO
1282#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1283STMT_START { \
1284 while (s <= e) { \
1285 if ( (CoNd) \
fac1af77 1286 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1287 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1288 goto got_it; \
1289 s++; \
1290 } \
1291} STMT_END
1292
1293#define REXEC_FBC_UTF8_SCAN(CoDe) \
1294STMT_START { \
9a902117 1295 while (s < strend) { \
4cadc6a9 1296 CoDe \
9a902117 1297 s += UTF8SKIP(s); \
4cadc6a9
YO
1298 } \
1299} STMT_END
1300
1301#define REXEC_FBC_SCAN(CoDe) \
1302STMT_START { \
1303 while (s < strend) { \
1304 CoDe \
1305 s++; \
1306 } \
1307} STMT_END
1308
1309#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1310REXEC_FBC_UTF8_SCAN( \
1311 if (CoNd) { \
7aee35ff 1312 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1313 goto got_it; \
1314 else \
1315 tmp = doevery; \
1316 } \
1317 else \
1318 tmp = 1; \
1319)
1320
1321#define REXEC_FBC_CLASS_SCAN(CoNd) \
1322REXEC_FBC_SCAN( \
1323 if (CoNd) { \
24b23f37 1324 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1325 goto got_it; \
1326 else \
1327 tmp = doevery; \
1328 } \
1329 else \
1330 tmp = 1; \
1331)
1332
1333#define REXEC_FBC_TRYIT \
24b23f37 1334if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1335 goto got_it
1336
e1d1eefb 1337#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1338 if (utf8_target) { \
e1d1eefb
YO
1339 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1340 } \
1341 else { \
1342 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1343 }
e1d1eefb 1344
786e8c11
YO
1345#define DUMP_EXEC_POS(li,s,doutf8) \
1346 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1347
cfaf538b
KW
1348
1349#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1350 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1351 tmp = TEST_NON_UTF8(tmp); \
1352 REXEC_FBC_UTF8_SCAN( \
1353 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1354 tmp = !tmp; \
1355 IF_SUCCESS; \
1356 } \
1357 else { \
1358 IF_FAIL; \
1359 } \
1360 ); \
1361
1362#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1363 if (s == PL_bostr) { \
1364 tmp = '\n'; \
1365 } \
1366 else { \
1367 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1368 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1369 } \
1370 tmp = TeSt1_UtF8; \
1371 LOAD_UTF8_CHARCLASS_ALNUM(); \
1372 REXEC_FBC_UTF8_SCAN( \
1373 if (tmp == ! (TeSt2_UtF8)) { \
1374 tmp = !tmp; \
1375 IF_SUCCESS; \
1376 } \
1377 else { \
1378 IF_FAIL; \
1379 } \
1380 ); \
1381
63ac0dad
KW
1382/* The only difference between the BOUND and NBOUND cases is that
1383 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1384 * NBOUND. This is accomplished by passing it in either the if or else clause,
1385 * with the other one being empty */
1386#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1387 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1388
1389#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1390 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1391
1392#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1393 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1394
1395#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1396 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1397
63ac0dad
KW
1398
1399/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1400 * be passed in completely with the variable name being tested, which isn't
1401 * such a clean interface, but this is easier to read than it was before. We
1402 * are looking for the boundary (or non-boundary between a word and non-word
1403 * character. The utf8 and non-utf8 cases have the same logic, but the details
1404 * must be different. Find the "wordness" of the character just prior to this
1405 * one, and compare it with the wordness of this one. If they differ, we have
1406 * a boundary. At the beginning of the string, pretend that the previous
1407 * character was a new-line */
cfaf538b 1408#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1409 if (utf8_target) { \
cfaf538b 1410 UTF8_CODE \
63ac0dad
KW
1411 } \
1412 else { /* Not utf8 */ \
1413 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1414 tmp = TEST_NON_UTF8(tmp); \
1415 REXEC_FBC_SCAN( \
1416 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1417 tmp = !tmp; \
1418 IF_SUCCESS; \
1419 } \
1420 else { \
1421 IF_FAIL; \
1422 } \
1423 ); \
1424 } \
1425 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1426 goto got_it;
1427
786e8c11
YO
1428/* We know what class REx starts with. Try to find this position... */
1429/* if reginfo is NULL, its a dryrun */
1430/* annoyingly all the vars in this routine have different names from their counterparts
1431 in regmatch. /grrr */
1432
3c3eec57 1433STATIC char *
07be1b83 1434S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
984e6dd1 1435 const char *strend, regmatch_info *reginfo, bool is_utf8_pat)
a687059c 1436{
73104a1b
KW
1437 dVAR;
1438 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1439 char *pat_string; /* The pattern's exactish string */
1440 char *pat_end; /* ptr to end char of pat_string */
1441 re_fold_t folder; /* Function for computing non-utf8 folds */
1442 const U8 *fold_array; /* array for folding ords < 256 */
1443 STRLEN ln;
1444 STRLEN lnc;
73104a1b
KW
1445 U8 c1;
1446 U8 c2;
1447 char *e;
1448 I32 tmp = 1; /* Scratch variable? */
1449 const bool utf8_target = PL_reg_match_utf8;
1450 UV utf8_fold_flags = 0;
3018b823
KW
1451 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1452 with a result inverts that result, as 0^1 =
1453 1 and 1^1 = 0 */
1454 _char_class_number classnum;
1455
73104a1b 1456 RXi_GET_DECL(prog,progi);
2f7f8cb1 1457
73104a1b 1458 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1459
73104a1b
KW
1460 /* We know what class it must start with. */
1461 switch (OP(c)) {
1462 case ANYOF:
9aa1e39f 1463 case ANYOF_SYNTHETIC:
954a2af6 1464 case ANYOF_WARN_SUPER:
73104a1b
KW
1465 if (utf8_target) {
1466 REXEC_FBC_UTF8_CLASS_SCAN(
1467 reginclass(prog, c, (U8*)s, utf8_target));
1468 }
1469 else {
1470 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1471 }
1472 break;
1473 case CANY:
1474 REXEC_FBC_SCAN(
1475 if (tmp && (!reginfo || regtry(reginfo, &s)))
1476 goto got_it;
1477 else
1478 tmp = doevery;
1479 );
1480 break;
1481
1482 case EXACTFA:
984e6dd1 1483 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1484 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1485 goto do_exactf_utf8;
1486 }
1487 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1488 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1489 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1490
73104a1b
KW
1491 case EXACTF:
1492 if (utf8_target) {
16d951b7 1493
73104a1b
KW
1494 /* regcomp.c already folded this if pattern is in UTF-8 */
1495 utf8_fold_flags = 0;
1496 goto do_exactf_utf8;
1497 }
1498 fold_array = PL_fold;
1499 folder = foldEQ;
1500 goto do_exactf_non_utf8;
1501
1502 case EXACTFL:
984e6dd1 1503 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1504 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1505 goto do_exactf_utf8;
1506 }
1507 fold_array = PL_fold_locale;
1508 folder = foldEQ_locale;
1509 goto do_exactf_non_utf8;
3c760661 1510
73104a1b 1511 case EXACTFU_SS:
984e6dd1 1512 if (is_utf8_pat) {
73104a1b
KW
1513 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1514 }
1515 goto do_exactf_utf8;
16d951b7 1516
73104a1b
KW
1517 case EXACTFU_TRICKYFOLD:
1518 case EXACTFU:
984e6dd1
DM
1519 if (is_utf8_pat || utf8_target) {
1520 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1521 goto do_exactf_utf8;
1522 }
fac1af77 1523
73104a1b
KW
1524 /* Any 'ss' in the pattern should have been replaced by regcomp,
1525 * so we don't have to worry here about this single special case
1526 * in the Latin1 range */
1527 fold_array = PL_fold_latin1;
1528 folder = foldEQ_latin1;
1529
1530 /* FALL THROUGH */
1531
1532 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1533 are no glitches with fold-length differences
1534 between the target string and pattern */
1535
1536 /* The idea in the non-utf8 EXACTF* cases is to first find the
1537 * first character of the EXACTF* node and then, if necessary,
1538 * case-insensitively compare the full text of the node. c1 is the
1539 * first character. c2 is its fold. This logic will not work for
1540 * Unicode semantics and the german sharp ss, which hence should
1541 * not be compiled into a node that gets here. */
1542 pat_string = STRING(c);
1543 ln = STR_LEN(c); /* length to match in octets/bytes */
1544
1545 /* We know that we have to match at least 'ln' bytes (which is the
1546 * same as characters, since not utf8). If we have to match 3
1547 * characters, and there are only 2 availabe, we know without
1548 * trying that it will fail; so don't start a match past the
1549 * required minimum number from the far end */
1550 e = HOP3c(strend, -((I32)ln), s);
1551
1552 if (!reginfo && e < s) {
1553 e = s; /* Due to minlen logic of intuit() */
1554 }
fac1af77 1555
73104a1b
KW
1556 c1 = *pat_string;
1557 c2 = fold_array[c1];
1558 if (c1 == c2) { /* If char and fold are the same */
1559 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1560 }
1561 else {
1562 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1563 }
1564 break;
fac1af77 1565
73104a1b
KW
1566 do_exactf_utf8:
1567 {
1568 unsigned expansion;
1569
1570 /* If one of the operands is in utf8, we can't use the simpler folding
1571 * above, due to the fact that many different characters can have the
1572 * same fold, or portion of a fold, or different- length fold */
1573 pat_string = STRING(c);
1574 ln = STR_LEN(c); /* length to match in octets/bytes */
1575 pat_end = pat_string + ln;
984e6dd1 1576 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1577 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1578 : ln;
1579
1580 /* We have 'lnc' characters to match in the pattern, but because of
1581 * multi-character folding, each character in the target can match
1582 * up to 3 characters (Unicode guarantees it will never exceed
1583 * this) if it is utf8-encoded; and up to 2 if not (based on the
1584 * fact that the Latin 1 folds are already determined, and the
1585 * only multi-char fold in that range is the sharp-s folding to
1586 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1587 * string character. Adjust lnc accordingly, rounding up, so that
1588 * if we need to match at least 4+1/3 chars, that really is 5. */
1589 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1590 lnc = (lnc + expansion - 1) / expansion;
1591
1592 /* As in the non-UTF8 case, if we have to match 3 characters, and
1593 * only 2 are left, it's guaranteed to fail, so don't start a
1594 * match that would require us to go beyond the end of the string
1595 */
1596 e = HOP3c(strend, -((I32)lnc), s);
1597
1598 if (!reginfo && e < s) {
1599 e = s; /* Due to minlen logic of intuit() */
1600 }
0658cdde 1601
73104a1b
KW
1602 /* XXX Note that we could recalculate e to stop the loop earlier,
1603 * as the worst case expansion above will rarely be met, and as we
1604 * go along we would usually find that e moves further to the left.
1605 * This would happen only after we reached the point in the loop
1606 * where if there were no expansion we should fail. Unclear if
1607 * worth the expense */
1608
1609 while (s <= e) {
1610 char *my_strend= (char *)strend;
1611 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1612 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
73104a1b
KW
1613 && (!reginfo || regtry(reginfo, &s)) )
1614 {
1615 goto got_it;
1616 }
1617 s += (utf8_target) ? UTF8SKIP(s) : 1;
1618 }
1619 break;
1620 }
1621 case BOUNDL:
272d35c9 1622 RXp_MATCH_TAINTED_on(prog);
0eb30aeb
KW
1623 FBC_BOUND(isWORDCHAR_LC,
1624 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1626 break;
1627 case NBOUNDL:
272d35c9 1628 RXp_MATCH_TAINTED_on(prog);
0eb30aeb
KW
1629 FBC_NBOUND(isWORDCHAR_LC,
1630 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1631 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1632 break;
1633 case BOUND:
1634 FBC_BOUND(isWORDCHAR,
0eb30aeb 1635 isWORDCHAR_uni(tmp),
03940dc2 1636 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1637 break;
1638 case BOUNDA:
1639 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1640 isWORDCHAR_A(tmp),
1641 isWORDCHAR_A((U8*)s));
1642 break;
1643 case NBOUND:
1644 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1645 isWORDCHAR_uni(tmp),
03940dc2 1646 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1647 break;
1648 case NBOUNDA:
1649 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1650 isWORDCHAR_A(tmp),
1651 isWORDCHAR_A((U8*)s));
1652 break;
1653 case BOUNDU:
1654 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1655 isWORDCHAR_uni(tmp),
03940dc2 1656 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1657 break;
1658 case NBOUNDU:
1659 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1660 isWORDCHAR_uni(tmp),
03940dc2 1661 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1662 break;
73104a1b
KW
1663 case LNBREAK:
1664 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1665 is_LNBREAK_latin1_safe(s, strend)
1666 );
1667 break;
3018b823
KW
1668
1669 /* The argument to all the POSIX node types is the class number to pass to
1670 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1671
1672 case NPOSIXL:
1673 to_complement = 1;
1674 /* FALLTHROUGH */
1675
1676 case POSIXL:
272d35c9 1677 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
1678 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1679 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1680 break;
3018b823
KW
1681
1682 case NPOSIXD:
1683 to_complement = 1;
1684 /* FALLTHROUGH */
1685
1686 case POSIXD:
1687 if (utf8_target) {
1688 goto posix_utf8;
1689 }
1690 goto posixa;
1691
1692 case NPOSIXA:
1693 if (utf8_target) {
1694 /* The complement of something that matches only ASCII matches all
1695 * UTF-8 variant code points, plus everything in ASCII that isn't
1696 * in the class */
1697 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1698 || ! _generic_isCC_A(*s, FLAGS(c)));
1699 break;
1700 }
1701
1702 to_complement = 1;
1703 /* FALLTHROUGH */
1704
73104a1b 1705 case POSIXA:
3018b823 1706 posixa:
73104a1b 1707 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1708 * byte invariant character. */
1709 REXEC_FBC_CLASS_SCAN(
1710 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1711 break;
3018b823
KW
1712
1713 case NPOSIXU:
1714 to_complement = 1;
1715 /* FALLTHROUGH */
1716
1717 case POSIXU:
1718 if (! utf8_target) {
1719 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1720 FLAGS(c))));
1721 }
1722 else {
1723
1724 posix_utf8:
1725 classnum = (_char_class_number) FLAGS(c);
1726 if (classnum < _FIRST_NON_SWASH_CC) {
1727 while (s < strend) {
1728
1729 /* We avoid loading in the swash as long as possible, but
1730 * should we have to, we jump to a separate loop. This
1731 * extra 'if' statement is what keeps this code from being
1732 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1733 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1734 goto found_above_latin1;
1735 }
1736 if ((UTF8_IS_INVARIANT(*s)
1737 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1738 classnum)))
1739 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1740 && to_complement ^ cBOOL(
1741 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1742 classnum))))
1743 {
1744 if (tmp && (!reginfo || regtry(reginfo, &s)))
1745 goto got_it;
1746 else {
1747 tmp = doevery;
1748 }
1749 }
1750 else {
1751 tmp = 1;
1752 }
1753 s += UTF8SKIP(s);
1754 }
1755 }
1756 else switch (classnum) { /* These classes are implemented as
1757 macros */
1758 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1759 revert the change of \v matching this */
1760 /* FALL THROUGH */
1761
1762 case _CC_ENUM_PSXSPC:
1763 REXEC_FBC_UTF8_CLASS_SCAN(
1764 to_complement ^ cBOOL(isSPACE_utf8(s)));
1765 break;
1766
1767 case _CC_ENUM_BLANK:
1768 REXEC_FBC_UTF8_CLASS_SCAN(
1769 to_complement ^ cBOOL(isBLANK_utf8(s)));
1770 break;
1771
1772 case _CC_ENUM_XDIGIT:
1773 REXEC_FBC_UTF8_CLASS_SCAN(
1774 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1775 break;
1776
1777 case _CC_ENUM_VERTSPACE:
1778 REXEC_FBC_UTF8_CLASS_SCAN(
1779 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1780 break;
1781
1782 case _CC_ENUM_CNTRL:
1783 REXEC_FBC_UTF8_CLASS_SCAN(
1784 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1785 break;
1786
1787 default:
1788 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1789 assert(0); /* NOTREACHED */
1790 }
1791 }
1792 break;
1793
1794 found_above_latin1: /* Here we have to load a swash to get the result
1795 for the current code point */
1796 if (! PL_utf8_swash_ptrs[classnum]) {
1797 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1798 PL_utf8_swash_ptrs[classnum] =
1799 _core_swash_init("utf8", swash_property_names[classnum],
1800 &PL_sv_undef, 1, 0, NULL, &flags);
1801 }
1802
1803 /* This is a copy of the loop above for swash classes, though using the
1804 * FBC macro instead of being expanded out. Since we've loaded the
1805 * swash, we don't have to check for that each time through the loop */
1806 REXEC_FBC_UTF8_CLASS_SCAN(
1807 to_complement ^ cBOOL(_generic_utf8(
1808 classnum,
1809 s,
1810 swash_fetch(PL_utf8_swash_ptrs[classnum],
1811 (U8 *) s, TRUE))));
73104a1b
KW
1812 break;
1813
1814 case AHOCORASICKC:
1815 case AHOCORASICK:
1816 {
1817 DECL_TRIE_TYPE(c);
1818 /* what trie are we using right now */
1819 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1820 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1821 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1822
1823 const char *last_start = strend - trie->minlen;
6148ee25 1824#ifdef DEBUGGING
73104a1b 1825 const char *real_start = s;
6148ee25 1826#endif
73104a1b
KW
1827 STRLEN maxlen = trie->maxlen;
1828 SV *sv_points;
1829 U8 **points; /* map of where we were in the input string
1830 when reading a given char. For ASCII this
1831 is unnecessary overhead as the relationship
1832 is always 1:1, but for Unicode, especially
1833 case folded Unicode this is not true. */
1834 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1835 U8 *bitmap=NULL;
1836
1837
1838 GET_RE_DEBUG_FLAGS_DECL;
1839
1840 /* We can't just allocate points here. We need to wrap it in
1841 * an SV so it gets freed properly if there is a croak while
1842 * running the match */
1843 ENTER;
1844 SAVETMPS;
1845 sv_points=newSV(maxlen * sizeof(U8 *));
1846 SvCUR_set(sv_points,
1847 maxlen * sizeof(U8 *));
1848 SvPOK_on(sv_points);
1849 sv_2mortal(sv_points);
1850 points=(U8**)SvPV_nolen(sv_points );
1851 if ( trie_type != trie_utf8_fold
1852 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1853 {
1854 if (trie->bitmap)
1855 bitmap=(U8*)trie->bitmap;
1856 else
1857 bitmap=(U8*)ANYOF_BITMAP(c);
1858 }
1859 /* this is the Aho-Corasick algorithm modified a touch
1860 to include special handling for long "unknown char" sequences.
1861 The basic idea being that we use AC as long as we are dealing
1862 with a possible matching char, when we encounter an unknown char
1863 (and we have not encountered an accepting state) we scan forward
1864 until we find a legal starting char.
1865 AC matching is basically that of trie matching, except that when
1866 we encounter a failing transition, we fall back to the current
1867 states "fail state", and try the current char again, a process
1868 we repeat until we reach the root state, state 1, or a legal
1869 transition. If we fail on the root state then we can either
1870 terminate if we have reached an accepting state previously, or
1871 restart the entire process from the beginning if we have not.
1872
1873 */
1874 while (s <= last_start) {
1875 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1876 U8 *uc = (U8*)s;
1877 U16 charid = 0;
1878 U32 base = 1;
1879 U32 state = 1;
1880 UV uvc = 0;
1881 STRLEN len = 0;
1882 STRLEN foldlen = 0;
1883 U8 *uscan = (U8*)NULL;
1884 U8 *leftmost = NULL;
1885#ifdef DEBUGGING
1886 U32 accepted_word= 0;
786e8c11 1887#endif
73104a1b
KW
1888 U32 pointpos = 0;
1889
1890 while ( state && uc <= (U8*)strend ) {
1891 int failed=0;
1892 U32 word = aho->states[ state ].wordnum;
1893
1894 if( state==1 ) {
1895 if ( bitmap ) {
1896 DEBUG_TRIE_EXECUTE_r(
1897 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1898 dump_exec_pos( (char *)uc, c, strend, real_start,
1899 (char *)uc, utf8_target );
1900 PerlIO_printf( Perl_debug_log,
1901 " Scanning for legal start char...\n");
1902 }
1903 );
1904 if (utf8_target) {
1905 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1906 uc += UTF8SKIP(uc);
1907 }
1908 } else {
1909 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1910 uc++;
1911 }
786e8c11 1912 }
73104a1b 1913 s= (char *)uc;
07be1b83 1914 }
73104a1b
KW
1915 if (uc >(U8*)last_start) break;
1916 }
1917
1918 if ( word ) {
1919 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1920 if (!leftmost || lpos < leftmost) {
1921 DEBUG_r(accepted_word=word);
1922 leftmost= lpos;
7016d6eb 1923 }
73104a1b 1924 if (base==0) break;
7016d6eb 1925
73104a1b
KW
1926 }
1927 points[pointpos++ % maxlen]= uc;
1928 if (foldlen || uc < (U8*)strend) {
1929 REXEC_TRIE_READ_CHAR(trie_type, trie,
1930 widecharmap, uc,
1931 uscan, len, uvc, charid, foldlen,
1932 foldbuf, uniflags);
1933 DEBUG_TRIE_EXECUTE_r({
1934 dump_exec_pos( (char *)uc, c, strend,
1935 real_start, s, utf8_target);
1936 PerlIO_printf(Perl_debug_log,
1937 " Charid:%3u CP:%4"UVxf" ",
1938 charid, uvc);
1939 });
1940 }
1941 else {
1942 len = 0;
1943 charid = 0;
1944 }
07be1b83 1945
73104a1b
KW
1946
1947 do {
6148ee25 1948#ifdef DEBUGGING
73104a1b 1949 word = aho->states[ state ].wordnum;
6148ee25 1950#endif
73104a1b
KW
1951 base = aho->states[ state ].trans.base;
1952
1953 DEBUG_TRIE_EXECUTE_r({
1954 if (failed)
1955 dump_exec_pos( (char *)uc, c, strend, real_start,
1956 s, utf8_target );
1957 PerlIO_printf( Perl_debug_log,
1958 "%sState: %4"UVxf", word=%"UVxf,
1959 failed ? " Fail transition to " : "",
1960 (UV)state, (UV)word);
1961 });
1962 if ( base ) {
1963 U32 tmp;
1964 I32 offset;
1965 if (charid &&
1966 ( ((offset = base + charid
1967 - 1 - trie->uniquecharcount)) >= 0)
1968 && ((U32)offset < trie->lasttrans)
1969 && trie->trans[offset].check == state
1970 && (tmp=trie->trans[offset].next))
1971 {
1972 DEBUG_TRIE_EXECUTE_r(
1973 PerlIO_printf( Perl_debug_log," - legal\n"));
1974 state = tmp;
1975 break;
07be1b83
YO
1976 }
1977 else {
786e8c11 1978 DEBUG_TRIE_EXECUTE_r(
73104a1b 1979 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 1980 failed = 1;
73104a1b 1981 state = aho->fail[state];
07be1b83 1982 }
07be1b83 1983 }
73104a1b
KW
1984 else {
1985 /* we must be accepting here */
1986 DEBUG_TRIE_EXECUTE_r(
1987 PerlIO_printf( Perl_debug_log," - accepting\n"));
1988 failed = 1;
1989 break;
786e8c11 1990 }
73104a1b
KW
1991 } while(state);
1992 uc += len;
1993 if (failed) {
1994 if (leftmost)
1995 break;
1996 if (!state) state = 1;
07be1b83 1997 }
73104a1b
KW
1998 }
1999 if ( aho->states[ state ].wordnum ) {
2000 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2001 if (!leftmost || lpos < leftmost) {
2002 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2003 leftmost = lpos;
07be1b83
YO
2004 }
2005 }
73104a1b
KW
2006 if (leftmost) {
2007 s = (char*)leftmost;
2008 DEBUG_TRIE_EXECUTE_r({
2009 PerlIO_printf(
2010 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2011 (UV)accepted_word, (IV)(s - real_start)
2012 );
2013 });
2014 if (!reginfo || regtry(reginfo, &s)) {
2015 FREETMPS;
2016 LEAVE;
2017 goto got_it;
2018 }
2019 s = HOPc(s,1);
2020 DEBUG_TRIE_EXECUTE_r({
2021 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2022 });
2023 } else {
2024 DEBUG_TRIE_EXECUTE_r(
2025 PerlIO_printf( Perl_debug_log,"No match.\n"));
2026 break;
2027 }
2028 }
2029 FREETMPS;
2030 LEAVE;
2031 }
2032 break;
2033 default:
2034 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2035 break;
2036 }
2037 return 0;
2038 got_it:
2039 return s;
6eb5f6b9
JH
2040}
2041
fae667d5 2042
6eb5f6b9
JH
2043/*
2044 - regexec_flags - match a regexp against a string
2045 */
2046I32
5aaab254 2047Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
6eb5f6b9 2048 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2049/* stringarg: the point in the string at which to begin matching */
2050/* strend: pointer to null at end of string */
2051/* strbeg: real beginning of string */
2052/* minend: end of match must be >= minend bytes after stringarg. */
2053/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2054 * itself is accessed via the pointers above */
2055/* data: May be used for some additional optimizations.
2056 Currently its only used, with a U32 cast, for transmitting
2057 the ganch offset when doing a /g match. This will change */
2058/* nosave: For optimizations. */
2059
6eb5f6b9 2060{
97aff369 2061 dVAR;
8d919b0a 2062 struct regexp *const prog = ReANY(rx);
5aaab254 2063 char *s;
eb578fdb 2064 regnode *c;
5aaab254 2065 char *startpos = stringarg;
6eb5f6b9
JH
2066 I32 minlen; /* must match at least this many chars */
2067 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2068 I32 end_shift = 0; /* Same for the end. */ /* CC */
2069 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2070 char *scream_olds = NULL;
f2ed9b32 2071 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2072 I32 multiline;
f8fc2ecf 2073 RXi_GET_DECL(prog,progi);
3b0527fe 2074 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2075 regexp_paren_pair *swap = NULL;
a3621e74
YO
2076 GET_RE_DEBUG_FLAGS_DECL;
2077
7918f24d 2078 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2079 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2080
2081 /* Be paranoid... */
2082 if (prog == NULL || startpos == NULL) {
2083 Perl_croak(aTHX_ "NULL regexp parameter");
2084 return 0;
2085 }
2086
bbe252da 2087 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2088 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2089
f2ed9b32 2090 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2091 DEBUG_EXECUTE_r(
f2ed9b32 2092 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2093 "Matching");
2094 );
bac06658 2095
6eb5f6b9 2096 minlen = prog->minlen;
1de06328
YO
2097
2098 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2099 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2100 "String too short [regexec_flags]...\n"));
2101 goto phooey;
1aa99e6b 2102 }
6eb5f6b9 2103
1de06328 2104
6eb5f6b9 2105 /* Check validity of program. */
f8fc2ecf 2106 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2107 Perl_croak(aTHX_ "corrupted regexp program");
2108 }
2109
272d35c9 2110 RX_MATCH_TAINTED_off(rx);
ed301438 2111 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2112 PL_reg_maxiter = 0;
2113
984e6dd1 2114 reginfo.is_utf8_pat = cBOOL(RX_UTF8(rx));
39819bd9 2115 reginfo.warned = FALSE;
6eb5f6b9 2116 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2117 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2118 PL_bostr = strbeg;
3b0527fe 2119 reginfo.sv = sv;
6eb5f6b9
JH
2120
2121 /* Mark end of line for $ (and such) */
2122 PL_regeol = strend;
2123
2124 /* see how far we have to get to not match where we matched before */
3b0527fe 2125 reginfo.till = startpos+minend;
6eb5f6b9 2126
6eb5f6b9
JH
2127 /* If there is a "must appear" string, look for it. */
2128 s = startpos;
2129
bbe252da 2130 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2131 MAGIC *mg;
2c296965 2132 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2133 reginfo.ganch = startpos + prog->gofs;
2c296965 2134 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2135 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2136 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2137 && SvMAGIC(sv)
14befaf4
DM
2138 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2139 && mg->mg_len >= 0) {
3b0527fe 2140 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2141 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2142 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2143
bbe252da 2144 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2145 if (s > reginfo.ganch)
6eb5f6b9 2146 goto phooey;
58e23c8d 2147 s = reginfo.ganch - prog->gofs;
2c296965 2148 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2149 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2150 if (s < strbeg)
2151 goto phooey;
6eb5f6b9
JH
2152 }
2153 }
58e23c8d 2154 else if (data) {
70685ca0 2155 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2156 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2157 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2158
2159 } else { /* pos() not defined */
3b0527fe 2160 reginfo.ganch = strbeg;
2c296965
YO
2161 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2162 "GPOS: reginfo.ganch = strbeg\n"));
2163 }
6eb5f6b9 2164 }
288b8c02 2165 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2166 /* We have to be careful. If the previous successful match
2167 was from this regex we don't want a subsequent partially
2168 successful match to clobber the old results.
2169 So when we detect this possibility we add a swap buffer
d8da0584
KW
2170 to the re, and switch the buffer each match. If we fail,
2171 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2172 */
2173 swap = prog->offs;
2174 /* do we need a save destructor here for eval dies? */
2175 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2176 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2177 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2178 PTR2UV(prog),
2179 PTR2UV(swap),
2180 PTR2UV(prog->offs)
2181 ));
c74340f9 2182 }
a0714e2c 2183 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2184 re_scream_pos_data d;
2185
2186 d.scream_olds = &scream_olds;
2187 d.scream_pos = &scream_pos;
288b8c02 2188 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2189 if (!s) {
a3621e74 2190 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2191 goto phooey; /* not present */
3fa9c3d7 2192 }
6eb5f6b9
JH
2193 }
2194
1de06328 2195
6eb5f6b9
JH
2196
2197 /* Simplest case: anchored match need be tried only once. */
2198 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2199 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2200 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2201 goto got_it;
bbe252da
YO
2202 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2203 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2204 {
2205 char *end;
2206
2207 if (minlen)
2208 dontbother = minlen - 1;
1aa99e6b 2209 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2210 /* for multiline we only have to try after newlines */
33b8afdf 2211 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2212 /* because of the goto we can not easily reuse the macros for bifurcating the
2213 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2214 if (utf8_target) {
2215 if (s == startpos)
2216 goto after_try_utf8;
2217 while (1) {
2218 if (regtry(&reginfo, &s)) {
2219 goto got_it;
2220 }
2221 after_try_utf8:
2222 if (s > end) {
2223 goto phooey;
2224 }
2225 if (prog->extflags & RXf_USE_INTUIT) {
2226 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2227 if (!s) {
2228 goto phooey;
2229 }
2230 }
2231 else {
2232 s += UTF8SKIP(s);
2233 }
2234 }
2235 } /* end search for check string in unicode */
2236 else {
2237 if (s == startpos) {
2238 goto after_try_latin;
2239 }
2240 while (1) {
2241 if (regtry(&reginfo, &s)) {
2242 goto got_it;
2243 }
2244 after_try_latin:
2245 if (s > end) {
2246 goto phooey;
2247 }
2248 if (prog->extflags & RXf_USE_INTUIT) {
2249 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2250 if (!s) {
2251 goto phooey;
2252 }
2253 }
2254 else {
2255 s++;
2256 }
2257 }
2258 } /* end search for check string in latin*/
2259 } /* end search for check string */
2260 else { /* search for newline */
2261 if (s > startpos) {
2262 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2263 s--;
92f3d482 2264 }
21eede78
YO
2265 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2266 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2267 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2268 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2269 goto got_it;
2270 }
92f3d482
YO
2271 }
2272 } /* end search for newline */
2273 } /* end anchored/multiline check string search */
6eb5f6b9 2274 goto phooey;
bbe252da 2275 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2276 {
486ec47a 2277 /* the warning about reginfo.ganch being used without initialization
bbe252da 2278 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2279 and we only enter this block when the same bit is set. */
58e23c8d 2280 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2281
2282 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2283 goto got_it;
2284 goto phooey;
2285 }
2286
2287 /* Messy cases: unanchored match. */
bbe252da 2288 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2289 /* we have /x+whatever/ */
984e6dd1 2290 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2291 char ch;
bf93d4cc
GS
2292#ifdef DEBUGGING
2293 int did_match = 0;
2294#endif
f2ed9b32 2295 if (utf8_target) {
7e0d5ad7
KW
2296 if (! prog->anchored_utf8) {
2297 to_utf8_substr(prog);
2298 }
2299 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2300 REXEC_FBC_SCAN(
6eb5f6b9 2301 if (*s == ch) {
a3621e74 2302 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2303 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2304 s += UTF8SKIP(s);
2305 while (s < strend && *s == ch)
2306 s += UTF8SKIP(s);
2307 }
4cadc6a9 2308 );
7e0d5ad7 2309
6eb5f6b9
JH
2310 }
2311 else {
7e0d5ad7
KW
2312 if (! prog->anchored_substr) {
2313 if (! to_byte_substr(prog)) {
6b54ddc5 2314 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2315 }
2316 }
2317 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2318 REXEC_FBC_SCAN(
6eb5f6b9 2319 if (*s == ch) {
a3621e74 2320 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2321 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2322 s++;
2323 while (s < strend && *s == ch)
2324 s++;
2325 }
4cadc6a9 2326 );
6eb5f6b9 2327 }
a3621e74 2328 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2329 PerlIO_printf(Perl_debug_log,
b7953727
JH
2330 "Did not find anchored character...\n")
2331 );
6eb5f6b9 2332 }
a0714e2c
SS
2333 else if (prog->anchored_substr != NULL
2334 || prog->anchored_utf8 != NULL
2335 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2336 && prog->float_max_offset < strend - s)) {
2337 SV *must;
2338 I32 back_max;
2339 I32 back_min;
2340 char *last;
6eb5f6b9 2341 char *last1; /* Last position checked before */
bf93d4cc
GS
2342#ifdef DEBUGGING
2343 int did_match = 0;
2344#endif
33b8afdf 2345 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2346 if (utf8_target) {
2347 if (! prog->anchored_utf8) {
2348 to_utf8_substr(prog);
2349 }
2350 must = prog->anchored_utf8;
2351 }
2352 else {
2353 if (! prog->anchored_substr) {
2354 if (! to_byte_substr(prog)) {
6b54ddc5 2355 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2356 }
2357 }
2358 must = prog->anchored_substr;
2359 }
33b8afdf
JH
2360 back_max = back_min = prog->anchored_offset;
2361 } else {
7e0d5ad7
KW
2362 if (utf8_target) {
2363 if (! prog->float_utf8) {
2364 to_utf8_substr(prog);
2365 }
2366 must = prog->float_utf8;
2367 }
2368 else {
2369 if (! prog->float_substr) {
2370 if (! to_byte_substr(prog)) {
6b54ddc5 2371 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2372 }
2373 }
2374 must = prog->float_substr;
2375 }
33b8afdf
JH
2376 back_max = prog->float_max_offset;
2377 back_min = prog->float_min_offset;
2378 }
1de06328 2379
1de06328
YO
2380 if (back_min<0) {
2381 last = strend;
2382 } else {
2383 last = HOP3c(strend, /* Cannot start after this */
2384 -(I32)(CHR_SVLEN(must)
2385 - (SvTAIL(must) != 0) + back_min), strbeg);
2386 }
6eb5f6b9
JH
2387 if (s > PL_bostr)
2388 last1 = HOPc(s, -1);
2389 else
2390 last1 = s - 1; /* bogus */
2391
a0288114 2392 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2393 check_substr==must. */
2394 scream_pos = -1;
2395 dontbother = end_shift;
2396 strend = HOPc(strend, -dontbother);
2397 while ( (s <= last) &&
c33e64f0 2398 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2399 (unsigned char*)strend, must,
c33e64f0 2400 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2401 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2402 if (HOPc(s, -back_max) > last1) {
2403 last1 = HOPc(s, -back_min);
2404 s = HOPc(s, -back_max);
2405 }
2406 else {
52657f30 2407 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2408
2409 last1 = HOPc(s, -back_min);
52657f30 2410 s = t;
6eb5f6b9 2411 }
f2ed9b32 2412 if (utf8_target) {
6eb5f6b9 2413 while (s <= last1) {
24b23f37 2414 if (regtry(&reginfo, &s))
6eb5f6b9 2415 goto got_it;
7016d6eb
DM
2416 if (s >= last1) {
2417 s++; /* to break out of outer loop */
2418 break;
2419 }
2420 s += UTF8SKIP(s);
6eb5f6b9
JH
2421 }
2422 }
2423 else {
2424 while (s <= last1) {
24b23f37 2425 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2426 goto got_it;
2427 s++;
2428 }
2429 }
2430 }
ab3bbdeb 2431 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2432 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2433 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2434 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2435 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2436 ? "anchored" : "floating"),
ab3bbdeb
YO
2437 quoted, RE_SV_TAIL(must));
2438 });
6eb5f6b9
JH
2439 goto phooey;
2440 }
f8fc2ecf 2441 else if ( (c = progi->regstclass) ) {
f14c76ed 2442 if (minlen) {
f8fc2ecf 2443 const OPCODE op = OP(progi->regstclass);
66e933ab 2444 /* don't bother with what can't match */
786e8c11 2445 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2446 strend = HOPc(strend, -(minlen - 1));
2447 }
a3621e74 2448 DEBUG_EXECUTE_r({
be8e71aa 2449 SV * const prop = sv_newmortal();
32fc9b6a 2450 regprop(prog, prop, c);
0df25f3d 2451 {
f2ed9b32 2452 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2453 s,strend-s,60);
0df25f3d 2454 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2455 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2456 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2457 quoted, (int)(strend - s));
0df25f3d 2458 }
ffc61ed2 2459 });
984e6dd1 2460 if (find_byclass(prog, c, s, strend, &reginfo, reginfo.is_utf8_pat))
6eb5f6b9 2461 goto got_it;
07be1b83 2462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2463 }
2464 else {
2465 dontbother = 0;
a0714e2c 2466 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2467 /* Trim the end. */
6af40bd7 2468 char *last= NULL;
33b8afdf 2469 SV* float_real;
c33e64f0
FC
2470 STRLEN len;
2471 const char *little;
33b8afdf 2472
7e0d5ad7
KW
2473 if (utf8_target) {
2474 if (! prog->float_utf8) {
2475 to_utf8_substr(prog);
2476 }
2477 float_real = prog->float_utf8;
2478 }
2479 else {
2480 if (! prog->float_substr) {
2481 if (! to_byte_substr(prog)) {
6b54ddc5 2482 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2483 }
2484 }
2485 float_real = prog->float_substr;
2486 }
d6a28714 2487
c33e64f0
FC
2488 little = SvPV_const(float_real, len);
2489 if (SvTAIL(float_real)) {
7f18ad16
KW
2490 /* This means that float_real contains an artificial \n on
2491 * the end due to the presence of something like this:
2492 * /foo$/ where we can match both "foo" and "foo\n" at the
2493 * end of the string. So we have to compare the end of the
2494 * string first against the float_real without the \n and
2495 * then against the full float_real with the string. We
2496 * have to watch out for cases where the string might be
2497 * smaller than the float_real or the float_real without
2498 * the \n. */
1a13b075
YO
2499 char *checkpos= strend - len;
2500 DEBUG_OPTIMISE_r(
2501 PerlIO_printf(Perl_debug_log,
2502 "%sChecking for float_real.%s\n",
2503 PL_colors[4], PL_colors[5]));
2504 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2505 /* can't match, even if we remove the trailing \n
2506 * string is too short to match */
1a13b075
YO
2507 DEBUG_EXECUTE_r(
2508 PerlIO_printf(Perl_debug_log,
2509 "%sString shorter than required trailing substring, cannot match.%s\n",
2510 PL_colors[4], PL_colors[5]));
2511 goto phooey;
2512 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2513 /* can match, the end of the string matches without the
2514 * "\n" */
1a13b075
YO
2515 last = checkpos + 1;
2516 } else if (checkpos < strbeg) {
7f18ad16
KW
2517 /* cant match, string is too short when the "\n" is
2518 * included */
1a13b075
YO
2519 DEBUG_EXECUTE_r(
2520 PerlIO_printf(Perl_debug_log,
2521 "%sString does not contain required trailing substring, cannot match.%s\n",
2522 PL_colors[4], PL_colors[5]));
2523 goto phooey;
2524 } else if (!multiline) {
7f18ad16
KW
2525 /* non multiline match, so compare with the "\n" at the
2526 * end of the string */
1a13b075
YO
2527 if (memEQ(checkpos, little, len)) {
2528 last= checkpos;
2529 } else {
2530 DEBUG_EXECUTE_r(
2531 PerlIO_printf(Perl_debug_log,
2532 "%sString does not contain required trailing substring, cannot match.%s\n",
2533 PL_colors[4], PL_colors[5]));
2534 goto phooey;
2535 }
2536 } else {
7f18ad16
KW
2537 /* multiline match, so we have to search for a place
2538 * where the full string is located */
d6a28714 2539 goto find_last;
1a13b075 2540 }
c33e64f0 2541 } else {
d6a28714 2542 find_last:
9041c2e3 2543 if (len)
d6a28714 2544 last = rninstr(s, strend, little, little + len);
b8c5462f 2545 else
a0288114 2546 last = strend; /* matching "$" */
b8c5462f 2547 }
6af40bd7 2548 if (!last) {
7f18ad16
KW
2549 /* at one point this block contained a comment which was
2550 * probably incorrect, which said that this was a "should not
2551 * happen" case. Even if it was true when it was written I am
2552 * pretty sure it is not anymore, so I have removed the comment
2553 * and replaced it with this one. Yves */
6bda09f9
YO
2554 DEBUG_EXECUTE_r(
2555 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2556 "String does not contain required substring, cannot match.\n"
2557 ));
2558 goto phooey;
bf93d4cc 2559 }
d6a28714
JH
2560 dontbother = strend - last + prog->float_min_offset;
2561 }
2562 if (minlen && (dontbother < minlen))
2563 dontbother = minlen - 1;
2564 strend -= dontbother; /* this one's always in bytes! */
2565 /* We don't know much -- general case. */
f2ed9b32 2566 if (utf8_target) {
d6a28714 2567 for (;;) {
24b23f37 2568 if (regtry(&reginfo, &s))
d6a28714
JH
2569 goto got_it;
2570 if (s >= strend)
2571 break;
b8c5462f 2572 s += UTF8SKIP(s);
d6a28714
JH
2573 };
2574 }
2575 else {
2576 do {
24b23f37 2577 if (regtry(&reginfo, &s))
d6a28714
JH
2578 goto got_it;
2579 } while (s++ < strend);
2580 }
2581 }
2582
2583 /* Failure. */
2584 goto phooey;
2585
2586got_it:
495f47a5
DM
2587 DEBUG_BUFFERS_r(
2588 if (swap)
2589 PerlIO_printf(Perl_debug_log,
2590 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2591 PTR2UV(prog),
2592 PTR2UV(swap)
2593 );
2594 );
e9105d30 2595 Safefree(swap);
d6a28714 2596
ed301438 2597 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2598 restore_pos(aTHX_ prog);
5daac39c
NC
2599 if (RXp_PAREN_NAMES(prog))
2600 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2601
2602 /* make sure $`, $&, $', and $digit will work later */
2603 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2604 if (flags & REXEC_COPY_STR) {
db2c6cb3
FC
2605#ifdef PERL_ANY_COW
2606 if (SvCANCOW(sv)) {
ed252734
NC
2607 if (DEBUG_C_TEST) {
2608 PerlIO_printf(Perl_debug_log,
2609 "Copy on write: regexp capture, type %d\n",
2610 (int) SvTYPE(sv));
2611 }
77f8f7c1 2612 RX_MATCH_COPY_FREE(rx);
ed252734 2613 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2614 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2615 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2616 prog->sublen = PL_regeol - strbeg;
2617 prog->suboffset = 0;
2618 prog->subcoffset = 0;
ed252734
NC
2619 } else
2620#endif
2621 {
6502e081
DM
2622 I32 min = 0;
2623 I32 max = PL_regeol - strbeg;
2624 I32 sublen;
2625
2626 if ( (flags & REXEC_COPY_SKIP_POST)
2627 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2628 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2629 ) { /* don't copy $' part of string */
3de645a8 2630 U32 n = 0;
6502e081
DM
2631 max = -1;
2632 /* calculate the right-most part of the string covered
2633 * by a capture. Due to look-ahead, this may be to
2634 * the right of $&, so we have to scan all captures */
2635 while (n <= prog->lastparen) {
2636 if (prog->offs[n].end > max)
2637 max = prog->offs[n].end;
2638 n++;
2639 }
2640 if (max == -1)
2641 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2642 ? prog->offs[0].start
2643 : 0;
2644 assert(max >= 0 && max <= PL_regeol - strbeg);
2645 }
2646
2647 if ( (flags & REXEC_COPY_SKIP_PRE)
2648 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2649 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2650 ) { /* don't copy $` part of string */
3de645a8 2651 U32 n = 0;
6502e081
DM
2652 min = max;
2653 /* calculate the left-most part of the string covered
2654 * by a capture. Due to look-behind, this may be to
2655 * the left of $&, so we have to scan all captures */
2656 while (min && n <= prog->lastparen) {
2657 if ( prog->offs[n].start != -1
2658 && prog->offs[n].start < min)
2659 {
2660 min = prog->offs[n].start;
2661 }
2662 n++;
2663 }
2664 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2665 && min > prog->offs[0].end
2666 )
2667 min = prog->offs[0].end;
2668
2669 }
2670
2671 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2672 sublen = max - min;
2673
2674 if (RX_MATCH_COPIED(rx)) {
2675 if (sublen > prog->sublen)
2676 prog->subbeg =
2677 (char*)saferealloc(prog->subbeg, sublen+1);
2678 }
2679 else
2680 prog->subbeg = (char*)safemalloc(sublen+1);
2681 Copy(strbeg + min, prog->subbeg, sublen, char);
2682 prog->subbeg[sublen] = '\0';
2683 prog->suboffset = min;
2684 prog->sublen = sublen;
77f8f7c1 2685 RX_MATCH_COPIED_on(rx);
6502e081 2686 }
6502e081
DM
2687 prog->subcoffset = prog->suboffset;
2688 if (prog->suboffset && utf8_target) {
2689 /* Convert byte offset to chars.
2690 * XXX ideally should only compute this if @-/@+
2691 * has been seen, a la PL_sawampersand ??? */
2692
2693 /* If there's a direct correspondence between the
2694 * string which we're matching and the original SV,
2695 * then we can use the utf8 len cache associated with
2696 * the SV. In particular, it means that under //g,
2697 * sv_pos_b2u() will use the previously cached
2698 * position to speed up working out the new length of
2699 * subcoffset, rather than counting from the start of
2700 * the string each time. This stops
2701 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2702 * from going quadratic */
2703 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2704 sv_pos_b2u(sv, &(prog->subcoffset));
2705 else
2706 prog->subcoffset = utf8_length((U8*)strbeg,
2707 (U8*)(strbeg+prog->suboffset));
2708 }
d6a28714
JH
2709 }
2710 else {
6502e081 2711 RX_MATCH_COPY_FREE(rx);
d6a28714 2712 prog->subbeg = strbeg;
6502e081
DM
2713 prog->suboffset = 0;
2714 prog->subcoffset = 0;
d6a28714
JH
2715 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2716 }
2717 }
9041c2e3 2718
d6a28714
JH
2719 return 1;
2720
2721phooey:
a3621e74 2722 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2723 PL_colors[4], PL_colors[5]));
ed301438 2724 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2725 restore_pos(aTHX_ prog);
e9105d30 2726 if (swap) {
c74340f9 2727 /* we failed :-( roll it back */
495f47a5
DM
2728 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2729 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2730 PTR2UV(prog),
2731 PTR2UV(prog->offs),
2732 PTR2UV(swap)
2733 ));
e9105d30
GG
2734 Safefree(prog->offs);
2735 prog->offs = swap;
2736 }
d6a28714
JH
2737 return 0;
2738}
2739
6bda09f9 2740
ec43f78b
DM
2741/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2742 * Do inc before dec, in case old and new rex are the same */
2743#define SET_reg_curpm(Re2) \
2744 if (PL_reg_state.re_state_eval_setup_done) { \
2745 (void)ReREFCNT_inc(Re2); \
2746 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2747 PM_SETRE((PL_reg_curpm), (Re2)); \
2748 }
2749
2750
d6a28714
JH
2751/*
2752 - regtry - try match at specific point
2753 */
2754STATIC I32 /* 0 failure, 1 success */
f73aaa43 2755S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2756{
97aff369 2757 dVAR;
d6a28714 2758 CHECKPOINT lastcp;
288b8c02 2759 REGEXP *const rx = reginfo->prog;
8d919b0a 2760 regexp *const prog = ReANY(rx);
f73aaa43 2761 I32 result;
f8fc2ecf 2762 RXi_GET_DECL(prog,progi);
a3621e74 2763 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2764
2765 PERL_ARGS_ASSERT_REGTRY;
2766
24b23f37 2767 reginfo->cutpoint=NULL;
d6a28714 2768
ed301438
DM
2769 if ((prog->extflags & RXf_EVAL_SEEN)
2770 && !PL_reg_state.re_state_eval_setup_done)
2771 {
d6a28714
JH
2772 MAGIC *mg;
2773
ed301438 2774 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2775 if (reginfo->sv) {
d6a28714 2776 /* Make $_ available to executed code. */
3b0527fe 2777 if (reginfo->sv != DEFSV) {
59f00321 2778 SAVE_DEFSV;
414bf5ae 2779 DEFSV_set(reginfo->sv);
b8c5462f 2780 }
d6a28714 2781
3b0527fe
DM
2782 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2783 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2784 /* prepare for quick setting of pos */
d300d9fa 2785#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2786 if (SvIsCOW(reginfo->sv))
2787 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2788#endif
3dab1dad 2789 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2790 &PL_vtbl_mglob, NULL, 0);
d6a28714 2791 mg->mg_len = -1;
b8c5462f 2792 }
d6a28714
JH
2793 PL_reg_magic = mg;
2794 PL_reg_oldpos = mg->mg_len;
4f639d21 2795 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2796 }
09687e5a 2797 if (!PL_reg_curpm) {
a02a5408 2798 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2799#ifdef USE_ITHREADS
2800 {
14a49a24 2801 SV* const repointer = &PL_sv_undef;
92313705
NC
2802 /* this regexp is also owned by the new PL_reg_curpm, which
2803 will try to free it. */
d2ece331 2804 av_push(PL_regex_padav, repointer);
09687e5a
AB
2805 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2806 PL_regex_pad = AvARRAY(PL_regex_padav);
2807 }
2808#endif
2809 }
ec43f78b 2810 SET_reg_curpm(rx);
d6a28714
JH
2811 PL_reg_oldcurpm = PL_curpm;
2812 PL_curpm = PL_reg_curpm;
07bc277f 2813 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2814 /* Here is a serious problem: we cannot rewrite subbeg,
2815 since it may be needed if this match fails. Thus
2816 $` inside (?{}) could fail... */
2817 PL_reg_oldsaved = prog->subbeg;
2818 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2819 PL_reg_oldsavedoffset = prog->suboffset;
2820 PL_reg_oldsavedcoffset = prog->suboffset;
db2c6cb3 2821#ifdef PERL_ANY_COW
ed252734
NC
2822 PL_nrs = prog->saved_copy;
2823#endif
07bc277f 2824 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2825 }
2826 else
bd61b366 2827 PL_reg_oldsaved = NULL;
d6a28714 2828 prog->subbeg = PL_bostr;
6502e081
DM
2829 prog->suboffset = 0;
2830 prog->subcoffset = 0;
d6a28714
JH
2831 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2832 }
97ca13b7 2833#ifdef DEBUGGING
f73aaa43 2834 PL_reg_starttry = *startposp;
97ca13b7 2835#endif
f73aaa43 2836 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2837 prog->lastparen = 0;
03994de8 2838 prog->lastcloseparen = 0;
d6a28714
JH
2839
2840 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2841 to do this again and again, prog->lastparen should take care of
3dd2943c 2842 this! --ilya*/
dafc8851
JH
2843
2844 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2845 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2846 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2847 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2848 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2849 * Meanwhile, this code *is* needed for the
daf18116
JH
2850 * above-mentioned test suite tests to succeed. The common theme
2851 * on those tests seems to be returning null fields from matches.
225593e1 2852 * --jhi updated by dapm */
dafc8851 2853#if 1
d6a28714 2854 if (prog->nparens) {
b93070ed 2855 regexp_paren_pair *pp = prog->offs;
eb578fdb 2856 I32 i;
b93070ed 2857 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2858 ++pp;
2859 pp->start = -1;
2860 pp->end = -1;
d6a28714
JH
2861 }
2862 }
dafc8851 2863#endif
02db2b7b 2864 REGCP_SET(lastcp);
f73aaa43
DM
2865 result = regmatch(reginfo, *startposp, progi->program + 1);
2866 if (result != -1) {
2867 prog->offs[0].end = result;
d6a28714
JH
2868 return 1;
2869 }
24b23f37 2870 if (reginfo->cutpoint)
f73aaa43 2871 *startposp= reginfo->cutpoint;
02db2b7b 2872 REGCP_UNWIND(lastcp);
d6a28714
JH
2873 return 0;
2874}
2875
02db2b7b 2876
8ba1375e
MJD
2877#define sayYES goto yes
2878#define sayNO goto no
262b90c4 2879#define sayNO_SILENT goto no_silent
8ba1375e 2880
f9f4320a
YO
2881/* we dont use STMT_START/END here because it leads to
2882 "unreachable code" warnings, which are bogus, but distracting. */
2883#define CACHEsayNO \
c476f425
DM
2884 if (ST.cache_mask) \
2885 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2886 sayNO
3298f257 2887
a3621e74 2888/* this is used to determine how far from the left messages like
265c4333
YO
2889 'failed...' are printed. It should be set such that messages
2890 are inline with the regop output that created them.
a3621e74 2891*/
265c4333 2892#define REPORT_CODE_OFF 32
a3621e74
YO
2893
2894
40a82448
DM
2895#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2896#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2897#define CHRTEST_NOT_A_CP_1 -999
2898#define CHRTEST_NOT_A_CP_2 -998
9e137952 2899
86545054
DM
2900#define SLAB_FIRST(s) (&(s)->states[0])
2901#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2902
5d9a96ca
DM
2903/* grab a new slab and return the first slot in it */
2904
2905STATIC regmatch_state *
2906S_push_slab(pTHX)
2907{
a35a87e7 2908#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2909 dMY_CXT;
2910#endif
5d9a96ca
DM
2911 regmatch_slab *s = PL_regmatch_slab->next;
2912 if (!s) {
2913 Newx(s, 1, regmatch_slab);
2914 s->prev = PL_regmatch_slab;
2915 s->next = NULL;
2916 PL_regmatch_slab->next = s;
2917 }
2918 PL_regmatch_slab = s;
86545054 2919 return SLAB_FIRST(s);
5d9a96ca 2920}
5b47454d 2921
95b24440 2922
40a82448
DM
2923/* push a new state then goto it */
2924
4d5016e5
DM
2925#define PUSH_STATE_GOTO(state, node, input) \
2926 pushinput = input; \
40a82448
DM
2927 scan = node; \
2928 st->resume_state = state; \
2929 goto push_state;
2930
2931/* push a new state with success backtracking, then goto it */
2932
4d5016e5
DM
2933#define PUSH_YES_STATE_GOTO(state, node, input) \
2934 pushinput = input; \
40a82448
DM
2935 scan = node; \
2936 st->resume_state = state; \
2937 goto push_yes_state;
2938
aa283a38 2939
aa283a38 2940
4d5016e5 2941
d6a28714 2942/*
95b24440 2943
bf1f174e
DM
2944regmatch() - main matching routine
2945
2946This is basically one big switch statement in a loop. We execute an op,
2947set 'next' to point the next op, and continue. If we come to a point which
2948we may need to backtrack to on failure such as (A|B|C), we push a
2949backtrack state onto the backtrack stack. On failure, we pop the top
2950state, and re-enter the loop at the state indicated. If there are no more
2951states to pop, we return failure.
2952
2953Sometimes we also need to backtrack on success; for example /A+/, where
2954after successfully matching one A, we need to go back and try to
2955match another one; similarly for lookahead assertions: if the assertion
2956completes successfully, we backtrack to the state just before the assertion
2957and then carry on. In these cases, the pushed state is marked as
2958'backtrack on success too'. This marking is in fact done by a chain of
2959pointers, each pointing to the previous 'yes' state. On success, we pop to
2960the nearest yes state, discarding any intermediate failure-only states.
2961Sometimes a yes state is pushed just to force some cleanup code to be
2962called at the end of a successful match or submatch; e.g. (??{$re}) uses
2963it to free the inner regex.
2964
2965Note that failure backtracking rewinds the cursor position, while
2966success backtracking leaves it alone.
2967
2968A pattern is complete when the END op is executed, while a subpattern
2969such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2970ops trigger the "pop to last yes state if any, otherwise return true"
2971behaviour.
2972
2973A common convention in this function is to use A and B to refer to the two
2974subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2975the subpattern to be matched possibly multiple times, while B is the entire
2976rest of the pattern. Variable and state names reflect this convention.
2977
2978The states in the main switch are the union of ops and failure/success of
2979substates associated with with that op. For example, IFMATCH is the op
2980that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2981'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2982successfully matched A and IFMATCH_A_fail is a state saying that we have
2983just failed to match A. Resume states always come in pairs. The backtrack
2984state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2985at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2986on success or failure.
2987
2988The struct that holds a backtracking state is actually a big union, with
2989one variant for each major type of op. The variable st points to the
2990top-most backtrack struct. To make the code clearer, within each
2991block of code we #define ST to alias the relevant union.
2992
2993Here's a concrete example of a (vastly oversimplified) IFMATCH
2994implementation:
2995
2996 switch (state) {
2997 ....
2998
2999#define ST st->u.ifmatch
3000
3001 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3002 ST.foo = ...; // some state we wish to save
95b24440 3003 ...
bf1f174e
DM
3004 // push a yes backtrack state with a resume value of
3005 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3006 // first node of A:
4d5016e5 3007 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3008 // NOTREACHED
3009
3010 case IFMATCH_A: // we have successfully executed A; now continue with B
3011 next = B;
3012 bar = ST.foo; // do something with the preserved value
3013 break;
3014
3015 case IFMATCH_A_fail: // A failed, so the assertion failed
3016 ...; // do some housekeeping, then ...
3017 sayNO; // propagate the failure
3018
3019#undef ST
95b24440 3020
bf1f174e
DM
3021 ...
3022 }
95b24440 3023
bf1f174e
DM
3024For any old-timers reading this who are familiar with the old recursive
3025approach, the code above is equivalent to:
95b24440 3026
bf1f174e
DM
3027 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3028 {
3029 int foo = ...
95b24440 3030 ...
bf1f174e
DM
3031 if (regmatch(A)) {
3032 next = B;
3033 bar = foo;
3034 break;
95b24440 3035 }
bf1f174e
DM
3036 ...; // do some housekeeping, then ...
3037 sayNO; // propagate the failure
95b24440 3038 }
bf1f174e
DM
3039
3040The topmost backtrack state, pointed to by st, is usually free. If you
3041want to claim it, populate any ST.foo fields in it with values you wish to
3042save, then do one of
3043
4d5016e5
DM
3044 PUSH_STATE_GOTO(resume_state, node, newinput);
3045 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3046
3047which sets that backtrack state's resume value to 'resume_state', pushes a
3048new free entry to the top of the backtrack stack, then goes to 'node'.
3049On backtracking, the free slot is popped, and the saved state becomes the
3050new free state. An ST.foo field in this new top state can be temporarily
3051accessed to retrieve values, but once the main loop is re-entered, it
3052becomes available for reuse.
3053
3054Note that the depth of the backtrack stack constantly increases during the
3055left-to-right execution of the pattern, rather than going up and down with
3056the pattern nesting. For example the stack is at its maximum at Z at the
3057end of the pattern, rather than at X in the following:
3058
3059 /(((X)+)+)+....(Y)+....Z/
3060
3061The only exceptions to this are lookahead/behind assertions and the cut,
3062(?>A), which pop all the backtrack states associated with A before
3063continuing.
3064
486ec47a 3065Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3066PL_regmatch_state and st always point to the currently active state,
3067and PL_regmatch_slab points to the slab currently containing
3068PL_regmatch_state. The first time regmatch() is called, the first slab is
3069allocated, and is never freed until interpreter destruction. When the slab
3070is full, a new one is allocated and chained to the end. At exit from
3071regmatch(), slabs allocated since entry are freed.
3072
3073*/
95b24440 3074
40a82448 3075
5bc10b2c 3076#define DEBUG_STATE_pp(pp) \
265c4333 3077 DEBUG_STATE_r({ \
f2ed9b32 3078 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3079 PerlIO_printf(Perl_debug_log, \
5d458dd8 3080 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3081 depth*2, "", \
13d6edb4 3082 PL_reg_name[st->resume_state], \
5d458dd8
YO
3083 ((st==yes_state||st==mark_state) ? "[" : ""), \
3084 ((st==yes_state) ? "Y" : ""), \
3085 ((st==mark_state) ? "M" : ""), \
3086 ((st==yes_state||st==mark_state) ? "]" : "") \
3087 ); \
265c4333 3088 });
5bc10b2c 3089
40a82448 3090
3dab1dad 3091#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3092
3df15adc 3093#ifdef DEBUGGING
5bc10b2c 3094
ab3bbdeb 3095STATIC void
f2ed9b32 3096S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3097 const char *start, const char *end, const char *blurb)
3098{
efd26800 3099 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3100
3101 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3102
ab3bbdeb
YO
3103 if (!PL_colorset)
3104 reginitcolors();
3105 {
3106 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3107 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3108
f2ed9b32 3109 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3110 start, end - start, 60);
3111
3112 PerlIO_printf(Perl_debug_log,
3113 "%s%s REx%s %s against %s\n",
3114 PL_colors[4], blurb, PL_colors[5], s0, s1);
3115
f2ed9b32 3116 if (utf8_target||utf8_pat)
1de06328
YO
3117 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3118 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3119 utf8_pat && utf8_target ? " and " : "",
3120 utf8_target ? "string" : ""
ab3bbdeb
YO
3121 );
3122 }
3123}
3df15adc
YO
3124
3125STATIC void
786e8c11
YO
3126S_dump_exec_pos(pTHX_ const char *locinput,
3127 const regnode *scan,
3128 const char *loc_regeol,
3129 const char *loc_bostr,
3130 const char *loc_reg_starttry,
f2ed9b32 3131 const bool utf8_target)
07be1b83 3132{
786e8c11 3133 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3134 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3135 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3136 /* The part of the string before starttry has one color
3137 (pref0_len chars), between starttry and current
3138 position another one (pref_len - pref0_len chars),
3139 after the current position the third one.
3140 We assume that pref0_len <= pref_len, otherwise we
3141 decrease pref0_len. */
786e8c11
YO
3142 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3143 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3144 int pref0_len;
3145
7918f24d
NC
3146 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3147
f2ed9b32 3148 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3149 pref_len++;
786e8c11
YO
3150 pref0_len = pref_len - (locinput - loc_reg_starttry);
3151 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3152 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3153 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3154 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3155 l--;
3156 if (pref0_len < 0)
3157 pref0_len = 0;
3158 if (pref0_len > pref_len)
3159 pref0_len = pref_len;
3160 {
f2ed9b32 3161 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3162
ab3bbdeb 3163 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3164 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3165
ab3bbdeb 3166 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3167 (locinput - pref_len + pref0_len),
1de06328 3168 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3169
ab3bbdeb 3170 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3171 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3172
1de06328 3173 const STRLEN tlen=len0+len1+len2;
3df15adc 3174 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3175 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3176 (IV)(locinput - loc_bostr),
07be1b83 3177 len0, s0,
07be1b83 3178 len1, s1,
07be1b83 3179 (docolor ? "" : "> <"),
07be1b83 3180 len2, s2,
f9f4320a 3181 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3182 "");
3183 }
3184}
3df15adc 3185
07be1b83
YO
3186#endif
3187
0a4db386
YO
3188/* reg_check_named_buff_matched()
3189 * Checks to see if a named buffer has matched. The data array of
3190 * buffer numbers corresponding to the buffer is expected to reside
3191 * in the regexp->data->data array in the slot stored in the ARG() of
3192 * node involved. Note that this routine doesn't actually care about the
3193 * name, that information is not preserved from compilation to execution.
3194 * Returns the index of the leftmost defined buffer with the given name
3195 * or 0 if non of the buffers matched.
3196 */
3197STATIC I32
7918f24d
NC
3198S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3199{
0a4db386 3200 I32 n;
f8fc2ecf 3201 RXi_GET_DECL(rex,rexi);
ad64d0ec 3202 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3203 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3204
3205 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3206
0a4db386 3207 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3208 if ((I32)rex->lastparen >= nums[n] &&
3209 rex->offs[nums[n]].end != -1)
0a4db386
YO
3210 {
3211 return nums[n];
3212 }
3213 }
3214 return 0;
3215}
3216
2f554ef7
DM
3217
3218/* free all slabs above current one - called during LEAVE_SCOPE */
3219
3220STATIC void
3221S_clear_backtrack_stack(pTHX_ void *p)
3222{
3223 regmatch_slab *s = PL_regmatch_slab->next;
3224 PERL_UNUSED_ARG(p);
3225
3226 if (!s)
3227 return;
3228 PL_regmatch_slab->next = NULL;
3229 while (s) {
3230 regmatch_slab * const osl = s;
3231 s = s->next;
3232 Safefree(osl);
3233 }
3234}
c74f6de9 3235static bool
984e6dd1
DM
3236S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3237 U8* c1_utf8, int *c2p, U8* c2_utf8, bool is_utf8_pat)
c74f6de9 3238{
79a2a0e8
KW
3239 /* This function determines if there are one or two characters that match
3240 * the first character of the passed-in EXACTish node <text_node>, and if
3241 * so, returns them in the passed-in pointers.
c74f6de9 3242 *
79a2a0e8
KW
3243 * If it determines that no possible character in the target string can
3244 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3245 * the first character in <text_node> requires UTF-8 to represent, and the
3246 * target string isn't in UTF-8.)
c74f6de9 3247 *
79a2a0e8
KW
3248 * If there are more than two characters that could match the beginning of
3249 * <text_node>, or if more context is required to determine a match or not,
3250 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3251 *
3252 * The motiviation behind this function is to allow the caller to set up
3253 * tight loops for matching. If <text_node> is of type EXACT, there is
3254 * only one possible character that can match its first character, and so
3255 * the situation is quite simple. But things get much more complicated if
3256 * folding is involved. It may be that the first character of an EXACTFish
3257 * node doesn't participate in any possible fold, e.g., punctuation, so it
3258 * can be matched only by itself. The vast majority of characters that are
3259 * in folds match just two things, their lower and upper-case equivalents.
3260 * But not all are like that; some have multiple possible matches, or match
3261 * sequences of more than one character. This function sorts all that out.
3262 *
3263 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3264 * loop of trying to match A*, we know we can't exit where the thing
3265 * following it isn't a B. And something can't be a B unless it is the
3266 * beginning of B. By putting a quick test for that beginning in a tight
3267 * loop, we can rule out things that can't possibly be B without having to
3268 * break out of the loop, thus avoiding work. Similarly, if A is a single
3269 * character, we can make a tight loop matching A*, using the outputs of
3270 * this function.
3271 *
3272 * If the target string to match isn't in UTF-8, and there aren't
3273 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3274 * the one or two possible octets (which are characters in this situation)
3275 * that can match. In all cases, if there is only one character that can
3276 * match, *<c1p> and *<c2p> will be identical.
3277 *
3278 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3279 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3280 * can match the beginning of <text_node>. They should be declared with at
3281 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3282 * undefined what these contain.) If one or both of the buffers are
3283 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3284 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3285 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3286 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3287 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3288
3289 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8 3290
ddb0d839
KW
3291 UV c1 = CHRTEST_NOT_A_CP_1;
3292 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8
KW
3293 bool use_chrtest_void = FALSE;
3294
3295 /* Used when we have both utf8 input and utf8 output, to avoid converting
3296 * to/from code points */
3297 bool utf8_has_been_setup = FALSE;
3298
c74f6de9
KW
3299 dVAR;
3300
b4291290 3301 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3302
79a2a0e8
KW
3303 if (OP(text_node) == EXACT) {
3304
3305 /* In an exact node, only one thing can be matched, that first
3306 * character. If both the pat and the target are UTF-8, we can just
3307 * copy the input to the output, avoiding finding the code point of
3308 * that character */
984e6dd1 3309 if (!is_utf8_pat) {
79a2a0e8
KW
3310 c2 = c1 = *pat;
3311 }
3312 else if (utf8_target) {
3313 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3314 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3315 utf8_has_been_setup = TRUE;
3316 }
3317 else {
3318 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3319 }
79a2a0e8
KW
3320 }
3321 else /* an EXACTFish node */
984e6dd1 3322 if ((is_utf8_pat
79a2a0e8
KW
3323 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3324 pat + STR_LEN(text_node)))
984e6dd1 3325 || (!is_utf8_pat
79a2a0e8
KW
3326 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3327 pat + STR_LEN(text_node))))
3328 {
3329 /* Multi-character folds require more context to sort out. Also
3330 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3331 * handled outside this routine */
3332 use_chrtest_void = TRUE;
3333 }
3334 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
984e6dd1 3335 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
79a2a0e8
KW
3336 if (c1 > 256) {
3337 /* Load the folds hash, if not already done */
3338 SV** listp;
3339 if (! PL_utf8_foldclosures) {
3340 if (! PL_utf8_tofold) {
3341 U8 dummy[UTF8_MAXBYTES+1];
3342
3343 /* Force loading this by folding an above-Latin1 char */
3344 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3345 assert(PL_utf8_tofold); /* Verify that worked */
3346 }
3347 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3348 }
3349
3350 /* The fold closures data structure is a hash with the keys being
3351 * the UTF-8 of every character that is folded to, like 'k', and
3352 * the values each an array of all code points that fold to its
3353 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3354 * not included */
3355 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3356 (char *) pat,
3357 UTF8SKIP(pat),
3358 FALSE))))
3359 {
3360 /* Not found in the hash, therefore there are no folds
3361 * containing it, so there is only a single character that
3362 * could match */
3363 c2 = c1;
3364 }
3365 else { /* Does participate in folds */
3366 AV* list = (AV*) *listp;
3367 if (av_len(list) != 1) {
3368
3369 /* If there aren't exactly two folds to this, it is outside
3370 * the scope of this function */
3371 use_chrtest_void = TRUE;
3372 }
3373 else { /* There are two. Get them */
3374 SV** c_p = av_fetch(list, 0, FALSE);
3375 if (c_p == NULL) {
3376 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3377 }
3378 c1 = SvUV(*c_p);
3379
3380 c_p = av_fetch(list, 1, FALSE);
3381 if (c_p == NULL) {
3382 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3383 }
3384 c2 = SvUV(*c_p);
3385
3386 /* Folds that cross the 255/256 boundary are forbidden if
3387 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3388 * pattern character is above 256, and its only other match
3389 * is below 256, the only legal match will be to itself.
3390 * We have thrown away the original, so have to compute
3391 * which is the one above 255 */
3392 if ((c1 < 256) != (c2 < 256)) {
3393 if (OP(text_node) == EXACTFL
3394 || (OP(text_node) == EXACTFA
3395 && (isASCII(c1) || isASCII(c2))))
3396 {
3397 if (c1 < 256) {
3398 c1 = c2;
3399 }
3400 else {
3401 c2 = c1;
3402 }
3403 }
3404 }
3405 }
3406 }
3407 }
3408 else /* Here, c1 is < 255 */
3409 if (utf8_target
3410 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3411 && OP(text_node) != EXACTFL
3412 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3413 {
3414 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3415 * folds to this character in the pattern. All such cases except
3416 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3417 * involved in their folds, so are outside the scope of this
3418 * function */
3419 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3420 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3421 }
3422 else {
3423 use_chrtest_void = TRUE;
3424 }
c74f6de9
KW
3425 }
3426 else { /* Here nothing above Latin1 can fold to the pattern character */
3427 switch (OP(text_node)) {
3428
3429 case EXACTFL: /* /l rules */
79a2a0e8 3430 c2 = PL_fold_locale[c1];
c74f6de9
KW
3431 break;
3432
3433 case EXACTF:
3434 if (! utf8_target) { /* /d rules */
79a2a0e8 3435 c2 = PL_fold[c1];
c74f6de9
KW
3436 break;
3437 }
3438 /* FALLTHROUGH */
3439 /* /u rules for all these. This happens to work for
79a2a0e8 3440 * EXACTFA as nothing in Latin1 folds to ASCII */
c74f6de9
KW
3441 case EXACTFA:
3442 case EXACTFU_TRICKYFOLD:
79a2a0e8 3443 case EXACTFU_SS:
c74f6de9 3444 case EXACTFU:
79a2a0e8 3445 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3446 break;
3447
878531d3
KW
3448 default:
3449 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3450 assert(0); /* NOTREACHED */
c74f6de9
KW
3451 }
3452 }
3453 }
79a2a0e8
KW
3454
3455 /* Here have figured things out. Set up the returns */
3456 if (use_chrtest_void) {
3457 *c2p = *c1p = CHRTEST_VOID;
3458 }
3459 else if (utf8_target) {
3460 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3461 uvchr_to_utf8(c1_utf8, c1);
3462 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3463 }
c74f6de9 3464
79a2a0e8
KW
3465 /* Invariants are stored in both the utf8 and byte outputs; Use
3466 * negative numbers otherwise for the byte ones. Make sure that the
3467 * byte ones are the same iff the utf8 ones are the same */
3468 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3469 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3470 ? *c2_utf8
3471 : (c1 == c2)
3472 ? CHRTEST_NOT_A_CP_1
3473 : CHRTEST_NOT_A_CP_2;
3474 }
3475 else if (c1 > 255) {
3476 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3477 can represent */
3478 return FALSE;
3479 }
c74f6de9 3480
79a2a0e8
KW
3481 *c1p = *c2p = c2; /* c2 is the only representable value */
3482 }
3483 else { /* c1 is representable; see about c2 */
3484 *c1p = c1;
3485 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3486 }
2f554ef7 3487
c74f6de9
KW
3488 return TRUE;
3489}
2f554ef7 3490
f73aaa43
DM
3491/* returns -1 on failure, $+[0] on success */
3492STATIC I32
3493S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3494{
a35a87e7 3495#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3496 dMY_CXT;
3497#endif
27da23d5 3498 dVAR;
eb578fdb 3499 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3500 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3501 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3502 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3503 RXi_GET_DECL(rex,rexi);
2f554ef7 3504 I32 oldsave;
5d9a96ca 3505 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3506 regmatch_state *st;
5d9a96ca 3507 /* cache heavy used fields of st in registers */
eb578fdb
KW
3508 regnode *scan;
3509 regnode *next;
3510 U32 n = 0; /* general value; init to avoid compiler warning */
3511 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3512 char *locinput = startpos;
4d5016e5 3513 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3514 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3515
b69b0499 3516 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3517 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3518 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3519 const U32 max_nochange_depth =
3520 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3521 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3522 regmatch_state *yes_state = NULL; /* state to pop to on success of
3523 subpattern */
e2e6a0f1
YO
3524 /* mark_state piggy backs on the yes_state logic so that when we unwind
3525 the stack on success we can update the mark_state as we go */
3526 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3527 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3528 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3529 U32 state_num;
5d458dd8
YO
3530 bool no_final = 0; /* prevent failure from backtracking? */
3531 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3532 char *startpoint = locinput;
5d458dd8
YO
3533 SV *popmark = NULL; /* are we looking for a mark? */
3534 SV *sv_commit = NULL; /* last mark name seen in failure */
3535 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3536 during a successful match */
5d458dd8
YO
3537 U32 lastopen = 0; /* last open we saw */
3538 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3539 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3540 /* these three flags are set by various ops to signal information to
3541 * the very next op. They have a useful lifetime of exactly one loop
3542 * iteration, and are not preserved or restored by state pushes/pops
3543 */
3544 bool sw = 0; /* the condition value in (?(cond)a|b) */
3545 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3546 int logical = 0; /* the following EVAL is:
3547 0: (?{...})
3548 1: (?(?{...})X|Y)
3549 2: (??{...})
3550 or the following IFMATCH/UNLESSM is:
3551 false: plain (?=foo)
3552 true: used as a condition: (?(?=foo))
3553 */
81ed78b2
DM
3554 PAD* last_pad = NULL;
3555 dMULTICALL;
3556 I32 gimme = G_SCALAR;
3557 CV *caller_cv = NULL; /* who called us */
3558 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3559 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
92da3157 3560 U32 maxopenparen = 0; /* max '(' index seen so far */
3018b823
KW
3561 int to_complement; /* Invert the result? */
3562 _char_class_number classnum;
984e6dd1 3563 bool is_utf8_pat = reginfo->is_utf8_pat;
81ed78b2 3564
95b24440 3565#ifdef DEBUGGING
e68ec53f 3566 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3567#endif
3568
81ed78b2
DM
3569 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3570 multicall_oldcatch = 0;
3571 multicall_cv = NULL;
3572 cx = NULL;
4f8dbb2d
JL
3573 PERL_UNUSED_VAR(multicall_cop);
3574 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3575
3576
7918f24d
NC
3577 PERL_ARGS_ASSERT_REGMATCH;
3578
3b57cd43 3579 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3580 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3581 }));
5d9a96ca
DM
3582 /* on first ever call to regmatch, allocate first slab */
3583 if (!PL_regmatch_slab) {
3584 Newx(PL_regmatch_slab, 1, regmatch_slab);
3585 PL_regmatch_slab->prev = NULL;
3586 PL_regmatch_slab->next = NULL;
86545054 3587 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3588 }
3589
2f554ef7
DM
3590 oldsave = PL_savestack_ix;
3591 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3592 SAVEVPTR(PL_regmatch_slab);
3593 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3594
3595 /* grab next free state slot */
3596 st = ++PL_regmatch_state;
86545054 3597 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3598 st = PL_regmatch_state = S_push_slab(aTHX);
3599
d6a28714 3600 /* Note that nextchr is a byte even in UTF */
7016d6eb 3601 SET_nextchr;
d6a28714
JH
3602 scan = prog;
3603 while (scan != NULL) {
8ba1375e 3604
a3621e74 3605 DEBUG_EXECUTE_r( {
6136c704 3606 SV * const prop = sv_newmortal();
1de06328 3607 regnode *rnext=regnext(scan);
f2ed9b32 3608 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3609 regprop(rex, prop, scan);
07be1b83
YO
3610
3611 PerlIO_printf(Perl_debug_log,
3612 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3613 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3614 SvPVX_const(prop),
1de06328 3615 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3616 0 : (IV)(rnext - rexi->program));
2a782b5b 3617 });
d6a28714
JH
3618
3619 next = scan + NEXT_OFF(scan);
3620 if (next == scan)
3621 next = NULL;
40a82448 3622 state_num = OP(scan);
d6a28714 3623
40a82448 3624 reenter_switch:
3018b823 3625 to_complement = 0;
34a81e2b 3626
7016d6eb 3627 SET_nextchr;
e6ca698c 3628 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 3629
40a82448 3630 switch (state_num) {
3c0563b9 3631 case BOL: /* /^../ */
7fba1cd6 3632 if (locinput == PL_bostr)
d6a28714 3633 {
3b0527fe 3634 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3635 break;
3636 }
d6a28714 3637 sayNO;
3c0563b9
DM
3638
3639 case MBOL: /* /^../m */
12d33761 3640 if (locinput == PL_bostr ||
7016d6eb 3641 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 3642 {
b8c5462f
JH
3643 break;
3644 }
d6a28714 3645 sayNO;
3c0563b9
DM
3646
3647 case SBOL: /* /^../s */
c2a73568 3648 if (locinput == PL_bostr)
b8c5462f 3649 break;
d6a28714 3650 sayNO;
3c0563b9
DM
3651
3652 case GPOS: /* \G */
3b0527fe 3653 if (locinput == reginfo->ganch)
d6a28714
JH
3654 break;
3655 sayNO;
ee9b8eae 3656
3c0563b9 3657 case KEEPS: /* \K */
ee9b8eae 3658 /* update the startpoint */
b93070ed 3659 st->u.keeper.val = rex->offs[0].start;
b93070ed 3660 rex->offs[0].start = locinput - PL_bostr;
4d5016e5 3661 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
878531d3 3662 assert(0); /*NOTREACHED*/
ee9b8eae
YO
3663 case KEEPS_next_fail:
3664 /* rollback the start point change */
b93070ed 3665 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 3666 sayNO_SILENT;
878531d3 3667 assert(0); /*NOTREACHED*/
3c0563b9
DM
3668
3669 case EOL: /* /..$/ */
d6a28714 3670 goto seol;
3c0563b9
DM
3671
3672 case MEOL: /* /..$/m */
7016d6eb 3673 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3674 sayNO;
b8c5462f 3675 break;
3c0563b9
DM
3676
3677 case SEOL: /* /..$/s */
d6a28714 3678 seol:
7016d6eb 3679 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3680 sayNO;
d6a28714 3681 if (PL_regeol - locinput > 1)
b8c5462f 3682 sayNO;
b8c5462f 3683 break;
3c0563b9
DM
3684
3685 case EOS: /* \z */
7016d6eb 3686 if (!NEXTCHR_IS_EOS)
b8c5462f 3687 sayNO;
d6a28714 3688 break;
3c0563b9
DM
3689
3690 case SANY: /* /./s */
7016d6eb 3691 if (NEXTCHR_IS_EOS)
4633a7c4 3692 sayNO;
28b98f76 3693 goto increment_locinput;
3c0563b9
DM
3694
3695 case CANY: /* \C */
7016d6eb 3696 if (NEXTCHR_IS_EOS)
f33976b4 3697 sayNO;
3640db6b 3698 locinput++;
a0d0e21e 3699 break;
3c0563b9
DM
3700
3701 case REG_ANY: /* /./ */
7016d6eb 3702 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 3703 sayNO;
28b98f76
DM
3704 goto increment_locinput;
3705
166ba7cd
DM
3706
3707#undef ST
3708#define ST st->u.trie
3c0563b9 3709 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
3710 /* In this case the charclass data is available inline so
3711 we can fail fast without a lot of extra overhead.
3712 */
7016d6eb 3713 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
3714 DEBUG_EXECUTE_r(
3715 PerlIO_printf(Perl_debug_log,
3716 "%*s %sfailed to match trie start class...%s\n",
3717 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3718 );
3719 sayNO_SILENT;
118e2215 3720 assert(0); /* NOTREACHED */
786e8c11
YO
3721 }
3722 /* FALL THROUGH */
3c0563b9 3723 case TRIE: /* (ab|cd) */
2e64971a
DM
3724 /* the basic plan of execution of the trie is:
3725 * At the beginning, run though all the states, and
3726 * find the longest-matching word. Also remember the position
3727 * of the shortest matching word. For example, this pattern:
3728 * 1 2 3 4 5
3729 * ab|a|x|abcd|abc
3730 * when matched against the string "abcde", will generate
3731 * accept states for all words except 3, with the longest
895cc420 3732 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
3733 * the position being after char 1 of the string).
3734 *
3735 * Then for each matching word, in word order (i.e. 1,2,4,5),
3736 * we run the remainder of the pattern; on each try setting
3737 * the current position to the character following the word,
3738 * returning to try the next word on failure.
3739 *
3740 * We avoid having to build a list of words at runtime by
3741 * using a compile-time structure, wordinfo[].prev, which
3742 * gives, for each word, the previous accepting word (if any).
3743 * In the case above it would contain the mappings 1->2, 2->0,
3744 * 3->0, 4->5, 5->1. We can use this table to generate, from
3745 * the longest word (4 above), a list of all words, by
3746 * following the list of prev pointers; this gives us the
3747 * unordered list 4,5,1,2. Then given the current word we have
3748 * just tried, we can go through the list and find the
3749 * next-biggest word to try (so if we just failed on word 2,
3750 * the next in the list is 4).
3751 *
3752 * Since at runtime we don't record the matching position in
3753 * the string for each word, we have to work that out for
3754 * each word we're about to process. The wordinfo table holds
3755 * the character length of each word; given that we recorded
3756 * at the start: the position of the shortest word and its
3757 * length in chars, we just need to move the pointer the
3758 * difference between the two char lengths. Depending on
3759 * Unicode status and folding, that's cheap or expensive.
3760 *
3761 * This algorithm is optimised for the case where are only a
3762 * small number of accept states, i.e. 0,1, or maybe 2.
3763 * With lots of accepts states, and having to try all of them,
3764 * it becomes quadratic on number of accept states to find all
3765 * the next words.
3766 */
3767
3dab1dad 3768 {
07be1b83 3769 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3770 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3771
3772 /* what trie are we using right now */
be8e71aa 3773 reg_trie_data * const trie
f8fc2ecf 3774 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3775 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3776 U32 state = trie->startstate;
166ba7cd 3777
7016d6eb
DM
3778 if ( trie->bitmap
3779 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3780 {
3dab1dad
YO
3781 if (trie->states[ state ].wordnum) {
3782 DEBUG_EXECUTE_r(
3783 PerlIO_printf(Perl_debug_log,
3784 "%*s %smatched empty string...%s\n",
5bc10b2c 3785 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3786 );
20dbff7c
YO
3787 if (!trie->jump)
3788 break;
3dab1dad
YO
3789 } else {
3790 DEBUG_EXECUTE_r(
3791 PerlIO_printf(Perl_debug_log,
786e8c11 3792 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3793 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3794 );
3795 sayNO_SILENT;
3796 }
3797 }
166ba7cd 3798
786e8c11
YO
3799 {
3800 U8 *uc = ( U8* )locinput;
3801
3802 STRLEN len = 0;
3803 STRLEN foldlen = 0;
3804 U8 *uscan = (U8*)NULL;
786e8c11 3805 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3806 U32 charcount = 0; /* how many input chars we have matched */
3807 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3808
786e8c11 3809 ST.jump = trie->jump;
786e8c11 3810 ST.me = scan;
2e64971a
DM
3811 ST.firstpos = NULL;
3812 ST.longfold = FALSE; /* char longer if folded => it's harder */
3813 ST.nextword = 0;
3814
3815 /* fully traverse the TRIE; note the position of the
3816 shortest accept state and the wordnum of the longest
3817 accept state */
07be1b83 3818
a3621e74 3819 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3820 U32 base = trie->states[ state ].trans.base;
f9f4320a 3821 UV uvc = 0;
acb909b4 3822 U16 charid = 0;
2e64971a
DM
3823 U16 wordnum;
3824 wordnum = trie->states[ state ].wordnum;
3825
3826 if (wordnum) { /* it's an accept state */
3827 if (!accepted) {
3828 accepted = 1;
3829 /* record first match position */
3830 if (ST.longfold) {
3831 ST.firstpos = (U8*)locinput;
3832 ST.firstchars = 0;
5b47454d 3833 }
2e64971a
DM
3834 else {
3835 ST.firstpos = uc;
3836 ST.firstchars = charcount;
3837 }
3838 }
3839 if (!ST.nextword || wordnum < ST.nextword)
3840 ST.nextword = wordnum;
3841 ST.topword = wordnum;
786e8c11 3842 }
a3621e74 3843
07be1b83 3844 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3845 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3846 PerlIO_printf( Perl_debug_log,
2e64971a 3847 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3848 2+depth * 2, "", PL_colors[4],
2e64971a 3849 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3850 });
a3621e74 3851
2e64971a 3852 /* read a char and goto next state */
7016d6eb 3853 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
6dd2be57 3854 I32 offset;
55eed653
NC
3855 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3856 uscan, len, uvc, charid, foldlen,
3857 foldbuf, uniflags);
2e64971a
DM
3858 charcount++;
3859 if (foldlen>0)
3860 ST.longfold = TRUE;
5b47454d 3861 if (charid &&
6dd2be57
DM
3862 ( ((offset =
3863 base + charid - 1 - trie->uniquecharcount)) >= 0)
3864
3865 && ((U32)offset < trie->lasttrans)
3866 && trie->trans[offset].check == state)
5b47454d 3867 {
6dd2be57 3868 state = trie->trans[offset].next;
5b47454d
DM
3869 }
3870 else {
3871 state = 0;
3872 }
3873 uc += len;
3874
3875 }
3876 else {
a3621e74
YO
3877 state = 0;
3878 }
3879 DEBUG_TRIE_EXECUTE_r(
e4584336 3880 PerlIO_printf( Perl_debug_log,
786e8c11 3881 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3882 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3883 );
3884 }
2e64971a 3885 if (!accepted)
a3621e74 3886 sayNO;
a3621e74 3887
2e64971a
DM
3888 /* calculate total number of accept states */
3889 {
3890 U16 w = ST.topword;
3891 accepted = 0;
3892 while (w) {
3893 w = trie->wordinfo[w].prev;
3894 accepted++;
3895 }
3896 ST.accepted = accepted;
3897 }
3898
166ba7cd
DM
3899 DEBUG_EXECUTE_r(
3900 PerlIO_printf( Perl_debug_log,
3901 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3902 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3903 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3904 );
2e64971a 3905 goto trie_first_try; /* jump into the fail handler */
786e8c11 3906 }}
118e2215 3907 assert(0); /* NOTREACHED */
2e64971a
DM
3908
3909 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
3910 {
3911 U8 *uc;
fae667d5
YO
3912 if ( ST.jump) {
3913 REGCP_UNWIND(ST.cp);
a8d1f4b4 3914 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3915 }
2e64971a
DM
3916 if (!--ST.accepted) {
3917 DEBUG_EXECUTE_r({
3918 PerlIO_printf( Perl_debug_log,
3919 "%*s %sTRIE failed...%s\n",
3920 REPORT_CODE_OFF+depth*2, "",
3921 PL_colors[4],
3922 PL_colors[5] );
3923 });
3924 sayNO_SILENT;
3925 }
3926 {
3927 /* Find next-highest word to process. Note that this code
3928 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
3929 U16 min = 0;
3930 U16 word;
3931 U16 const nextword = ST.nextword;
3932 reg_trie_wordinfo * const wordinfo
2e64971a
DM
3933 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3934 for (word=ST.topword; word; word=wordinfo[word].prev) {
3935 if (word > nextword && (!min || word < min))
3936 min = word;
3937 }
3938 ST.nextword = min;
3939 }
3940
fae667d5 3941 trie_first_try:
5d458dd8
YO
3942 if (do_cutgroup) {
3943 do_cutgroup = 0;
3944 no_final = 0;
3945 }
fae667d5
YO
3946
3947 if ( ST.jump) {
b93070ed 3948 ST.lastparen = rex->lastparen;
f6033a9d 3949 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 3950 REGCP_SET(ST.cp);
2e64971a 3951 }
a3621e74 3952
2e64971a 3953 /* find start char of end of current word */
166ba7cd 3954 {
2e64971a 3955 U32 chars; /* how many chars to skip */
2e64971a
DM
3956 reg_trie_data * const trie
3957 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3958
3959 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3960 >= ST.firstchars);
3961 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3962 - ST.firstchars;
a059a757 3963 uc = ST.firstpos;
2e64971a
DM
3964
3965 if (ST.longfold) {
3966 /* the hard option - fold each char in turn and find
3967 * its folded length (which may be different */
3968 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3969 STRLEN foldlen;
3970 STRLEN len;
d9a396a3 3971 UV uvc;
2e64971a
DM
3972 U8 *uscan;
3973
3974 while (chars) {
f2ed9b32 3975 if (utf8_target) {
2e64971a
DM
3976 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3977 uniflags);
3978 uc += len;
3979 }
3980 else {
3981 uvc = *uc;
3982 uc++;
3983 }
3984 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3985 uscan = foldbuf;
3986 while (foldlen) {
3987 if (!--chars)
3988 break;
3989 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3990 uniflags);
3991 uscan += len;
3992 foldlen -= len;
3993 }
3994 }
a3621e74 3995 }
2e64971a 3996 else {
f2ed9b32 3997 if (utf8_target)
2e64971a
DM
3998 while (chars--)
3999 uc += UTF8SKIP(uc);
4000 else
4001 uc += chars;
4002 }
2e64971a 4003 }
166ba7cd 4004
6603fe3e
DM
4005 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4006 ? ST.jump[ST.nextword]
4007 : NEXT_OFF(ST.me));
166ba7cd 4008
2e64971a
DM
4009 DEBUG_EXECUTE_r({
4010 PerlIO_printf( Perl_debug_log,
4011 "%*s %sTRIE matched word #%d, continuing%s\n",
4012 REPORT_CODE_OFF+depth*2, "",
4013 PL_colors[4],
4014 ST.nextword,
4015 PL_colors[5]
4016 );
4017 });
4018
4019 if (ST.accepted > 1 || has_cutgroup) {
a059a757 4020 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
118e2215 4021 assert(0); /* NOTREACHED */
166ba7cd 4022 }
2e64971a
DM
4023 /* only one choice left - just continue */
4024 DEBUG_EXECUTE_r({
4025 AV *const trie_words
4026 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4027 SV ** const tmp = av_fetch( trie_words,
4028 ST.nextword-1, 0 );
4029 SV *sv= tmp ? sv_newmortal() : NULL;
4030
4031 PerlIO_printf( Perl_debug_log,
4032 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4033 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4034 ST.nextword,
4035 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4036 PL_colors[0], PL_colors[1],
c89df6cf 4037 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
4038 )
4039 : "not compiled under -Dr",
4040 PL_colors[5] );
4041 });
4042
a059a757 4043 locinput = (char*)uc;
2e64971a 4044 continue; /* execute rest of RE */
118e2215 4045 assert(0); /* NOTREACHED */
a059a757 4046 }
166ba7cd
DM
4047#undef ST
4048
3c0563b9 4049 case EXACT: { /* /abc/ */
95b24440 4050 char *s = STRING(scan);
24d3c4a9 4051 ln = STR_LEN(scan);
984e6dd1 4052 if (utf8_target != is_utf8_pat) {
bc517b45 4053 /* The target and the pattern have differing utf8ness. */
1aa99e6b 4054 char *l = locinput;
24d3c4a9 4055 const char * const e = s + ln;
a72c7584 4056
f2ed9b32 4057 if (utf8_target) {
e6a3850e
KW
4058 /* The target is utf8, the pattern is not utf8.
4059 * Above-Latin1 code points can't match the pattern;
4060 * invariants match exactly, and the other Latin1 ones need
4061 * to be downgraded to a single byte in order to do the
4062 * comparison. (If we could be confident that the target
4063 * is not malformed, this could be refactored to have fewer
4064 * tests by just assuming that if the first bytes match, it
4065 * is an invariant, but there are tests in the test suite
4066 * dealing with (??{...}) which violate this) */
1aa99e6b 4067 while (s < e) {
5ac65bff 4068 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
e6a3850e
KW
4069 sayNO;
4070 }
4071 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4072 if (*l != *s) {
4073 sayNO;
4074 }
4075 l++;
4076 }
4077 else {
4078 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4079 sayNO;
4080 }
4081 l += 2;
4082 }
4083 s++;
1aa99e6b 4084 }
5ff6fc6d
JH
4085 }
4086 else {
4087 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 4088 while (s < e) {
e6a3850e
KW
4089 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4090 {
4091 sayNO;
4092 }
4093 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4094 if (*s != *l) {
4095 sayNO;
4096 }
4097 s++;
4098 }
4099 else {
4100 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4101 sayNO;
4102 }
4103 s += 2;
4104 }
4105 l++;
1aa99e6b 4106 }
5ff6fc6d 4107 }
1aa99e6b 4108 locinput = l;
1aa99e6b 4109 }
5ac65bff
KW
4110 else {
4111 /* The target and the pattern have the same utf8ness. */
4112 /* Inline the first character, for speed. */
4113 if (PL_regeol - locinput < ln
4114 || UCHARAT(s) != nextchr
4115 || (ln > 1 && memNE(s, locinput, ln)))
4116 {
4117 sayNO;
4118 }
4119 locinput += ln;
4120 }
d6a28714 4121 break;
95b24440 4122 }
7016d6eb 4123
3c0563b9 4124 case EXACTFL: { /* /abc/il */
a932d541 4125 re_fold_t folder;
9a5a5549
KW
4126 const U8 * fold_array;
4127 const char * s;
d513472c 4128 U32 fold_utf8_flags;
9a5a5549 4129
272d35c9 4130 RX_MATCH_TAINTED_on(reginfo->prog);
f67f9e53
KW
4131 folder = foldEQ_locale;
4132 fold_array = PL_fold_locale;
17580e7a 4133 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
4134 goto do_exactf;
4135
3c0563b9
DM
4136 case EXACTFU_SS: /* /\x{df}/iu */
4137 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4138 case EXACTFU: /* /abc/iu */
9a5a5549
KW
4139 folder = foldEQ_latin1;
4140 fold_array = PL_fold_latin1;
984e6dd1 4141 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
4142 goto do_exactf;
4143
3c0563b9 4144 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
4145 folder = foldEQ_latin1;
4146 fold_array = PL_fold_latin1;
57014d77 4147 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
4148 goto do_exactf;
4149
3c0563b9 4150 case EXACTF: /* /abc/i */
9a5a5549
KW
4151 folder = foldEQ;
4152 fold_array = PL_fold;
62bf7766 4153 fold_utf8_flags = 0;
9a5a5549
KW
4154
4155 do_exactf:
4156 s = STRING(scan);
24d3c4a9 4157 ln = STR_LEN(scan);
d6a28714 4158
984e6dd1 4159 if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
3c760661
KW
4160 /* Either target or the pattern are utf8, or has the issue where
4161 * the fold lengths may differ. */
be8e71aa 4162 const char * const l = locinput;
d07ddd77 4163 char *e = PL_regeol;
bc517b45 4164
984e6dd1 4165 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
fa5b1667 4166 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
4167 {
4168 sayNO;
5486206c 4169 }
d07ddd77 4170 locinput = e;
d07ddd77 4171 break;
a0ed51b3 4172 }
d6a28714 4173
0a138b74 4174 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
4175 if (UCHARAT(s) != nextchr
4176 && !NEXTCHR_IS_EOS
4177 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 4178 {
a0ed51b3 4179 sayNO;
9a5a5549 4180 }
24d3c4a9 4181 if (PL_regeol - locinput < ln)
b8c5462f 4182 sayNO;
9a5a5549 4183 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 4184 sayNO;
24d3c4a9 4185 locinput += ln;
a0d0e21e 4186 break;
9a5a5549 4187 }
63ac0dad
KW
4188
4189 /* XXX Could improve efficiency by separating these all out using a
4190 * macro or in-line function. At that point regcomp.c would no longer
4191 * have to set the FLAGS fields of these */
3c0563b9
DM
4192 case BOUNDL: /* /\b/l */
4193 case NBOUNDL: /* /\B/l */
272d35c9 4194 RX_MATCH_TAINTED_on(reginfo->prog);
b2680017 4195 /* FALL THROUGH */
3c0563b9
DM
4196 case BOUND: /* /\b/ */
4197 case BOUNDU: /* /\b/u */
4198 case BOUNDA: /* /\b/a */
4199 case NBOUND: /* /\B/ */
4200 case NBOUNDU: /* /\B/u */
4201 case NBOUNDA: /* /\B/a */
b2680017 4202 /* was last char in word? */
f2e96b5d
KW
4203 if (utf8_target
4204 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4205 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4206 {
b2680017
YO
4207 if (locinput == PL_bostr)
4208 ln = '\n';
4209 else {
4210 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4211
4212 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4213 }
63ac0dad 4214 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
0eb30aeb 4215 ln = isWORDCHAR_uni(ln);
7016d6eb
DM
4216 if (NEXTCHR_IS_EOS)
4217 n = 0;
4218 else {
4219 LOAD_UTF8_CHARCLASS_ALNUM();
03940dc2 4220 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
7016d6eb
DM
4221 utf8_target);
4222 }
b2680017
YO
4223 }
4224 else {
0eb30aeb
KW
4225 ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4226 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
b2680017
YO
4227 }
4228 }
4229 else {
cfaf538b
KW
4230
4231 /* Here the string isn't utf8, or is utf8 and only ascii
4232 * characters are to match \w. In the latter case looking at
4233 * the byte just prior to the current one may be just the final
4234 * byte of a multi-byte character. This is ok. There are two
4235 * cases:
4236 * 1) it is a single byte character, and then the test is doing
4237 * just what it's supposed to.
4238 * 2) it is a multi-byte character, in which case the final
4239 * byte is never mistakable for ASCII, and so the test
4240 * will say it is not a word character, which is the
4241 * correct answer. */
b2680017
YO
4242 ln = (locinput != PL_bostr) ?
4243 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
4244 switch (FLAGS(scan)) {
4245 case REGEX_UNICODE_CHARSET:
4246 ln = isWORDCHAR_L1(ln);
7016d6eb 4247 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
63ac0dad
KW
4248 break;
4249 case REGEX_LOCALE_CHARSET:
0eb30aeb
KW
4250 ln = isWORDCHAR_LC(ln);
4251 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
63ac0dad
KW
4252 break;
4253 case REGEX_DEPENDS_CHARSET:
0eb30aeb
KW
4254 ln = isWORDCHAR(ln);
4255 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
63ac0dad 4256 break;
cfaf538b 4257 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 4258 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b 4259 ln = isWORDCHAR_A(ln);
7016d6eb 4260 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
cfaf538b 4261 break;
63ac0dad
KW
4262 default:
4263 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4264 break;
b2680017
YO
4265 }
4266 }
63ac0dad
KW
4267 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4268 * regcomp.sym */
4269 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
4270 sayNO;
4271 break;
3c0563b9 4272
3c0563b9 4273 case ANYOF: /* /[abc]/ */
954a2af6 4274 case ANYOF_WARN_SUPER:
7016d6eb
DM
4275 if (NEXTCHR_IS_EOS)
4276 sayNO;
e0193e47 4277 if (utf8_target) {
635cd5d4 4278 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
09b08e9b 4279 sayNO;
635cd5d4 4280 locinput += UTF8SKIP(locinput);
ffc61ed2
JH
4281 }
4282 else {
20ed0b26 4283 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 4284 sayNO;
3640db6b 4285 locinput++;
e0f9d4a8 4286 }
b8c5462f 4287 break;
3c0563b9 4288
3018b823
KW
4289 /* The argument (FLAGS) to all the POSIX node types is the class number
4290 * */
ee9a90b8 4291
3018b823
KW
4292 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4293 to_complement = 1;
4294 /* FALLTHROUGH */
4295
4296 case POSIXL: /* \w or [:punct:] etc. under /l */
4297 if (NEXTCHR_IS_EOS)
bedac28b 4298 sayNO;
bedac28b 4299
3018b823
KW
4300 /* The locale hasn't influenced the outcome before this, so defer
4301 * tainting until now */
272d35c9 4302 RX_MATCH_TAINTED_on(reginfo->prog);
3018b823
KW
4303
4304 /* Use isFOO_lc() for characters within Latin1. (Note that
4305 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4306 * wouldn't be invariant) */
4307 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
eb4e9c04 4308 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
bedac28b
KW
4309 sayNO;
4310 }
4311 }
3018b823
KW
4312 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4313 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
eb4e9c04 4314 (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
3018b823
KW
4315 *(locinput + 1))))))
4316 {
bedac28b 4317 sayNO;
3018b823 4318 }
bedac28b 4319 }
3018b823
KW
4320 else { /* Here, must be an above Latin-1 code point */
4321 goto utf8_posix_not_eos;
bedac28b 4322 }
3018b823
KW
4323
4324 /* Here, must be utf8 */
4325 locinput += UTF8SKIP(locinput);
bedac28b
KW
4326 break;
4327
3018b823
KW
4328 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4329 to_complement = 1;
4330 /* FALLTHROUGH */
4331
4332 case POSIXD: /* \w or [:punct:] etc. under /d */
bedac28b 4333 if (utf8_target) {
3018b823 4334 goto utf8_posix;
bedac28b 4335 }
3018b823
KW
4336 goto posixa;
4337
4338 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
bedac28b 4339
3018b823 4340 if (NEXTCHR_IS_EOS) {
bedac28b
KW
4341 sayNO;
4342 }
bedac28b 4343
3018b823
KW
4344 /* All UTF-8 variants match */
4345 if (! UTF8_IS_INVARIANT(nextchr)) {
4346 goto increment_locinput;
bedac28b 4347 }
ee9a90b8 4348
3018b823
KW
4349 to_complement = 1;
4350 /* FALLTHROUGH */
4351
4352 case POSIXA: /* \w or [:punct:] etc. under /a */
4353
4354 posixa:
4355 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4356 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4357 * character is a single byte */
20d0b1e9 4358
3018b823
KW
4359 if (NEXTCHR_IS_EOS
4360 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4361 FLAGS(scan)))))
4362 {
0658cdde
KW
4363 sayNO;
4364 }
3018b823
KW
4365
4366 /* Here we are either not in utf8, or we matched a utf8-invariant,
4367 * so the next char is the next byte */
3640db6b 4368 locinput++;
0658cdde 4369 break;
3c0563b9 4370
3018b823
KW
4371 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4372 to_complement = 1;
4373 /* FALLTHROUGH */
4374
4375 case POSIXU: /* \w or [:punct:] etc. under /u */
4376 utf8_posix:
4377 if (NEXTCHR_IS_EOS) {
0658cdde
KW
4378 sayNO;
4379 }
3018b823
KW
4380 utf8_posix_not_eos:
4381
4382 /* Use _generic_isCC() for characters within Latin1. (Note that
4383 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4384 * wouldn't be invariant) */
4385 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4386 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4387 FLAGS(scan)))))
4388 {
4389 sayNO;
4390 }
4391 locinput++;
4392 }
4393 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4394 if (! (to_complement
4395 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4396 *(locinput + 1)),
4397 FLAGS(scan)))))
4398 {
4399 sayNO;
4400 }
4401 locinput += 2;
4402 }
4403 else { /* Handle above Latin-1 code points */
4404 classnum = (_char_class_number) FLAGS(scan);
4405 if (classnum < _FIRST_NON_SWASH_CC) {
4406
4407 /* Here, uses a swash to find such code points. Load if if
4408 * not done already */
4409 if (! PL_utf8_swash_ptrs[classnum]) {
4410 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4411 PL_utf8_swash_ptrs[classnum]
4412 = _core_swash_init("utf8",
4413 swash_property_names[classnum],
4414 &PL_sv_undef, 1, 0, NULL, &flags);
4415 }
4416 if (! (to_complement
4417 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4418 (U8 *) locinput, TRUE))))
4419 {
4420 sayNO;
4421 }
4422 }
4423 else { /* Here, uses macros to find above Latin-1 code points */
4424 switch (classnum) {
4425 case _CC_ENUM_SPACE: /* XXX would require separate
4426 code if we revert the change
4427 of \v matching this */
4428 case _CC_ENUM_PSXSPC:
4429 if (! (to_complement
4430 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4431 {
4432 sayNO;
4433 }
4434 break;
4435 case _CC_ENUM_BLANK:
4436 if (! (to_complement
4437 ^ cBOOL(is_HORIZWS_high(locinput))))
4438 {
4439 sayNO;
4440 }
4441 break;
4442 case _CC_ENUM_XDIGIT:
4443 if (! (to_complement
4444 ^ cBOOL(is_XDIGIT_high(locinput))))
4445 {
4446 sayNO;
4447 }
4448 break;
4449 case _CC_ENUM_VERTSPACE:
4450 if (! (to_complement
4451 ^ cBOOL(is_VERTWS_high(locinput))))
4452 {
4453 sayNO;
4454 }
4455 break;
4456 default: /* The rest, e.g. [:cntrl:], can't match
4457 above Latin1 */
4458 if (! to_complement) {
4459 sayNO;
4460 }
4461 break;
4462 }
4463 }
4464 locinput += UTF8SKIP(locinput);
4465 }
4466 break;
0658cdde 4467
37e2e78e
KW
4468 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4469 a Unicode extended Grapheme Cluster */
4470 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4471 extended Grapheme Cluster is:
4472
7aee35ff
KW
4473 CR LF
4474 | Prepend* Begin Extend*
4475 | .
37e2e78e 4476
7aee35ff
KW
4477 Begin is: ( Special_Begin | ! Control )
4478 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4479 Extend is: ( Grapheme_Extend | Spacing_Mark )
4480 Control is: [ GCB_Control | CR | LF ]
4481 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4482
27d4fc33
KW
4483 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4484 we can rewrite
4485
4486 Begin is ( Regular_Begin + Special Begin )
4487
4488 It turns out that 98.4% of all Unicode code points match
4489 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4490 the previous implementation for almost all Unicode code points.
27d4fc33 4491
37e2e78e
KW
4492 There is a subtlety with Prepend* which showed up in testing.
4493 Note that the Begin, and only the Begin is required in:
4494 | Prepend* Begin Extend*
cc3b396d
KW
4495 Also, Begin contains '! Control'. A Prepend must be a
4496 '! Control', which means it must also be a Begin. What it
4497 comes down to is that if we match Prepend* and then find no
4498 suitable Begin afterwards, that if we backtrack the last
4499 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4500 */
4501
7016d6eb 4502 if (NEXTCHR_IS_EOS)
a0ed51b3 4503 sayNO;
f2ed9b32 4504 if (! utf8_target) {
37e2e78e
KW
4505
4506 /* Match either CR LF or '.', as all the other possibilities
4507 * require utf8 */
4508 locinput++; /* Match the . or CR */
cc3b396d
KW
4509 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4510 match the LF */
37e2e78e 4511 && locinput < PL_regeol
e699a1d5
KW
4512 && UCHARAT(locinput) == '\n')
4513 {
4514 locinput++;
4515 }
37e2e78e
KW
4516 }
4517 else {
4518
4519 /* Utf8: See if is ( CR LF ); already know that locinput <
4520 * PL_regeol, so locinput+1 is in bounds */
7016d6eb 4521 if ( nextchr == '\r' && locinput+1 < PL_regeol
f67f9e53 4522 && UCHARAT(locinput + 1) == '\n')
7016d6eb 4523 {
37e2e78e
KW
4524 locinput += 2;
4525 }
4526 else {
45fdf108
KW
4527 STRLEN len;
4528
37e2e78e
KW
4529 /* In case have to backtrack to beginning, then match '.' */
4530 char *starting = locinput;
4531
4532 /* In case have to backtrack the last prepend */
e699a1d5 4533 char *previous_prepend = NULL;
37e2e78e
KW
4534
4535 LOAD_UTF8_CHARCLASS_GCB();
4536
45fdf108
KW
4537 /* Match (prepend)* */
4538 while (locinput < PL_regeol
4539 && (len = is_GCB_Prepend_utf8(locinput)))
4540 {
4541 previous_prepend = locinput;
4542 locinput += len;
a1853d78 4543 }
37e2e78e
KW
4544
4545 /* As noted above, if we matched a prepend character, but
4546 * the next thing won't match, back off the last prepend we
4547 * matched, as it is guaranteed to match the begin */
4548 if (previous_prepend
4549 && (locinput >= PL_regeol
c101f46d
KW
4550 || (! swash_fetch(PL_utf8_X_regular_begin,
4551 (U8*)locinput, utf8_target)
bff53399 4552 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
c101f46d 4553 )
37e2e78e
KW
4554 {
4555 locinput = previous_prepend;
4556 }
4557
4558 /* Note that here we know PL_regeol > locinput, as we
4559 * tested that upon input to this switch case, and if we
4560 * moved locinput forward, we tested the result just above
4561 * and it either passed, or we backed off so that it will
4562 * now pass */
11dfcd49
KW
4563 if (swash_fetch(PL_utf8_X_regular_begin,
4564 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4565 locinput += UTF8SKIP(locinput);
4566 }
bff53399 4567 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
37e2e78e
KW
4568
4569 /* Here did not match the required 'Begin' in the
4570 * second term. So just match the very first
4571 * character, the '.' of the final term of the regex */
4572 locinput = starting + UTF8SKIP(starting);
27d4fc33 4573 goto exit_utf8;
37e2e78e
KW
4574 } else {
4575
11dfcd49
KW
4576 /* Here is a special begin. It can be composed of
4577 * several individual characters. One possibility is
4578 * RI+ */
45fdf108
KW
4579 if ((len = is_GCB_RI_utf8(locinput))) {
4580 locinput += len;
11dfcd49 4581 while (locinput < PL_regeol
45fdf108 4582 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4583 {
45fdf108 4584 locinput += len;
11dfcd49 4585 }
45fdf108
KW
4586 } else if ((len = is_GCB_T_utf8(locinput))) {
4587 /* Another possibility is T+ */
4588 locinput += len;
11dfcd49 4589 while (locinput < PL_regeol
45fdf108 4590 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4591 {
45fdf108 4592 locinput += len;
11dfcd49
KW
4593 }
4594 } else {
4595
4596 /* Here, neither RI+ nor T+; must be some other
4597 * Hangul. That means it is one of the others: L,
4598 * LV, LVT or V, and matches:
4599 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4600
4601 /* Match L* */
4602 while (locinput < PL_regeol
45fdf108 4603 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4604 {
45fdf108 4605 locinput += len;
11dfcd49 4606 }
37e2e78e 4607
11dfcd49
KW
4608 /* Here, have exhausted L*. If the next character
4609 * is not an LV, LVT nor V, it means we had to have
4610 * at least one L, so matches L+ in the original
4611 * equation, we have a complete hangul syllable.
4612 * Are done. */
4613
4614 if (locinput < PL_regeol
45fdf108 4615 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49 4616 {
11dfcd49 4617 /* Otherwise keep going. Must be LV, LVT or V.
7d43c479
KW
4618 * See if LVT, by first ruling out V, then LV */
4619 if (! is_GCB_V_utf8(locinput)
4620 /* All but every TCount one is LV */
4621 && (valid_utf8_to_uvchr((U8 *) locinput,
4622 NULL)
4623 - SBASE)
4624 % TCount != 0)
4625 {
11dfcd49
KW
4626 locinput += UTF8SKIP(locinput);
4627 } else {
4628
4629 /* Must be V or LV. Take it, then match
4630 * V* */
4631 locinput += UTF8SKIP(locinput);
4632 while (locinput < PL_regeol
45fdf108 4633 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 4634 {
45fdf108 4635 locinput += len;
11dfcd49
KW
4636 }
4637 }
37e2e78e 4638
11dfcd49 4639 /* And any of LV, LVT, or V can be followed
45fdf108 4640 * by T* */
11dfcd49 4641 while (locinput < PL_regeol
45fdf108 4642 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4643 {
45fdf108 4644 locinput += len;
11dfcd49
KW
4645 }
4646 }
cd94d768 4647 }
11dfcd49 4648 }
37e2e78e 4649
11dfcd49
KW
4650 /* Match any extender */
4651 while (locinput < PL_regeol
4652 && swash_fetch(PL_utf8_X_extend,
4653 (U8*)locinput, utf8_target))
4654 {
4655 locinput += UTF8SKIP(locinput);
4656 }
37e2e78e 4657 }
27d4fc33 4658 exit_utf8:
37e2e78e
KW
4659 if (locinput > PL_regeol) sayNO;
4660 }
a0ed51b3 4661 break;
81714fb9 4662
3c0563b9 4663 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
4664 { /* The capture buffer cases. The ones beginning with N for the
4665 named buffers just convert to the equivalent numbered and
4666 pretend they were called as the corresponding numbered buffer
4667 op. */
26ecd678
TC
4668 /* don't initialize these in the declaration, it makes C++
4669 unhappy */
81714fb9 4670 char *s;
ff1157ca 4671 char type;
8368298a
TC
4672 re_fold_t folder;
4673 const U8 *fold_array;
26ecd678 4674 UV utf8_fold_flags;
8368298a 4675
272d35c9 4676 RX_MATCH_TAINTED_on(reginfo->prog);
d7ef4b73
KW
4677 folder = foldEQ_locale;
4678 fold_array = PL_fold_locale;
4679 type = REFFL;
17580e7a 4680 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4681 goto do_nref;
4682
3c0563b9 4683 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
4684 folder = foldEQ_latin1;
4685 fold_array = PL_fold_latin1;
4686 type = REFFA;
4687 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4688 goto do_nref;
4689
3c0563b9 4690 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
4691 folder = foldEQ_latin1;
4692 fold_array = PL_fold_latin1;
4693 type = REFFU;
d513472c 4694 utf8_fold_flags = 0;
d7ef4b73
KW
4695 goto do_nref;
4696
3c0563b9 4697 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
4698 folder = foldEQ;
4699 fold_array = PL_fold;
4700 type = REFF;
d513472c 4701 utf8_fold_flags = 0;
d7ef4b73
KW
4702 goto do_nref;
4703
3c0563b9 4704 case NREF: /* /\g{name}/ */
d7ef4b73 4705 type = REF;
83d7b90b
KW
4706 folder = NULL;
4707 fold_array = NULL;
d513472c 4708 utf8_fold_flags = 0;
d7ef4b73
KW
4709 do_nref:
4710
4711 /* For the named back references, find the corresponding buffer
4712 * number */
0a4db386
YO
4713 n = reg_check_named_buff_matched(rex,scan);
4714
d7ef4b73 4715 if ( ! n ) {
81714fb9 4716 sayNO;
d7ef4b73
KW
4717 }
4718 goto do_nref_ref_common;
4719
3c0563b9 4720 case REFFL: /* /\1/il */
272d35c9 4721 RX_MATCH_TAINTED_on(reginfo->prog);
d7ef4b73
KW
4722 folder = foldEQ_locale;
4723 fold_array = PL_fold_locale;
17580e7a 4724 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4725 goto do_ref;
4726
3c0563b9 4727 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
4728 folder = foldEQ_latin1;
4729 fold_array = PL_fold_latin1;
4730 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4731 goto do_ref;
4732
3c0563b9 4733 case REFFU: /* /\1/iu */
d7ef4b73
KW
4734 folder = foldEQ_latin1;
4735 fold_array = PL_fold_latin1;
d513472c 4736 utf8_fold_flags = 0;
d7ef4b73
KW
4737 goto do_ref;
4738
3c0563b9 4739 case REFF: /* /\1/i */
d7ef4b73
KW
4740 folder = foldEQ;
4741 fold_array = PL_fold;
d513472c 4742 utf8_fold_flags = 0;
83d7b90b 4743 goto do_ref;
d7ef4b73 4744
3c0563b9 4745 case REF: /* /\1/ */
83d7b90b
KW
4746 folder = NULL;
4747 fold_array = NULL;
d513472c 4748 utf8_fold_flags = 0;
83d7b90b 4749
d7ef4b73 4750 do_ref:
81714fb9 4751 type = OP(scan);
d7ef4b73
KW
4752 n = ARG(scan); /* which paren pair */
4753
4754 do_nref_ref_common:
b93070ed 4755 ln = rex->offs[n].start;
2c2d71f5 4756 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
b93070ed 4757 if (rex->lastparen < n || ln == -1)
af3f8c16 4758 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4759 if (ln == rex->offs[n].end)
a0d0e21e 4760 break;
a0ed51b3 4761
24d3c4a9 4762 s = PL_bostr + ln;
d7ef4b73 4763 if (type != REF /* REF can do byte comparison */
2f65c56d 4764 && (utf8_target || type == REFFU))
d7ef4b73
KW
4765 { /* XXX handle REFFL better */
4766 char * limit = PL_regeol;
4767
4768 /* This call case insensitively compares the entire buffer
4769 * at s, with the current input starting at locinput, but
4770 * not going off the end given by PL_regeol, and returns in
2db97d41 4771 * <limit> upon success, how much of the current input was
d7ef4b73 4772 * matched */
b93070ed 4773 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4774 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4775 {
4776 sayNO;
a0ed51b3 4777 }
d7ef4b73 4778 locinput = limit;
a0ed51b3
LW
4779 break;
4780 }
4781
d7ef4b73 4782 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
4783 if (!NEXTCHR_IS_EOS &&
4784 UCHARAT(s) != nextchr &&
81714fb9 4785 (type == REF ||
d7ef4b73 4786 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4787 sayNO;
b93070ed 4788 ln = rex->offs[n].end - ln;
24d3c4a9 4789 if (locinput + ln > PL_regeol)
4633a7c4 4790 sayNO;
81714fb9 4791 if (ln > 1 && (type == REF
24d3c4a9 4792 ? memNE(s, locinput, ln)
d7ef4b73 4793 : ! folder(s, locinput, ln)))
4633a7c4 4794 sayNO;
24d3c4a9 4795 locinput += ln;
a0d0e21e 4796 break;
81714fb9 4797 }
3c0563b9
DM
4798
4799 case NOTHING: /* null op; e.g. the 'nothing' following
4800 * the '*' in m{(a+|b)*}' */
4801 break;
4802 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 4803 break;
3c0563b9
DM
4804
4805 case BACK: /* ??? doesn't appear to be used ??? */
a0d0e21e 4806 break;
40a82448
DM
4807
4808#undef ST
4809#define ST st->u.eval
c277df42 4810 {
c277df42 4811 SV *ret;
d2f13c59 4812 REGEXP *re_sv;
6bda09f9 4813 regexp *re;
f8fc2ecf 4814 regexp_internal *rei;
1a147d38
YO
4815 regnode *startpoint;
4816
3c0563b9 4817 case GOSTART: /* (?R) */
e7707071
YO
4818 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4819 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4820 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4821 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4822 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4823 Perl_croak(aTHX_
4824 "Pattern subroutine nesting without pos change"
4825 " exceeded limit in regex");
6bda09f9
YO
4826 } else {
4827 nochange_depth = 0;
1a147d38 4828 }
288b8c02 4829 re_sv = rex_sv;
6bda09f9 4830 re = rex;
f8fc2ecf 4831 rei = rexi;
1a147d38 4832 if (OP(scan)==GOSUB) {
6bda09f9
YO
4833 startpoint = scan + ARG2L(scan);
4834 ST.close_paren = ARG(scan);
4835 } else {
f8fc2ecf 4836 startpoint = rei->program+1;
6bda09f9
YO
4837 ST.close_paren = 0;
4838 }
4839 goto eval_recurse_doit;
118e2215 4840 assert(0); /* NOTREACHED */
3c0563b9 4841
6bda09f9
YO
4842 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4843 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4844 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4845 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4846 } else {
4847 nochange_depth = 0;
4848 }
8e5e9ebe 4849 {
4aabdb9b 4850 /* execute the code in the {...} */
81ed78b2 4851
4aabdb9b 4852 dSP;
a6dc34f1 4853 IV before;
1f4d1a1e 4854 OP * const oop = PL_op;
4aabdb9b 4855 COP * const ocurcop = PL_curcop;
81ed78b2 4856 OP *nop;
d80618d2 4857 char *saved_regeol = PL_regeol;
91332126 4858 struct re_save_state saved_state;
81ed78b2 4859 CV *newcv;
91332126 4860
74088413 4861 /* save *all* paren positions */
92da3157 4862 regcppush(rex, 0, maxopenparen);
74088413
DM
4863 REGCP_SET(runops_cp);
4864
6562f1c4 4865 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4866 * eval we would normally put it on the save stack, like with
4867 * save_re_context. However, re-evals have a weird scoping so we
4868 * can't just add ENTER/LEAVE here. With that, things like
4869 *
4870 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4871 *
4872 * would break, as they expect the localisation to be unwound
4873 * only when the re-engine backtracks through the bit that
4874 * localised it.
4875 *
4876 * What we do instead is just saving the state in a local c
4877 * variable.
4878 */
91332126 4879 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
81ed78b2 4880
81ed78b2
DM
4881 if (!caller_cv)
4882 caller_cv = find_runcv(NULL);
4883
4aabdb9b 4884 n = ARG(scan);
81ed78b2 4885
b30fcab9 4886 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8d919b0a 4887 newcv = (ReANY(
b30fcab9
DM
4888 (REGEXP*)(rexi->data->data[n])
4889 ))->qr_anoncv
81ed78b2
DM
4890 ;
4891 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4892 }
4893 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4894 newcv = caller_cv;
4895 nop = (OP*)rexi->data->data[n];
4896 assert(CvDEPTH(newcv));
68e2671b
DM
4897 }
4898 else {
d24ca0c5
DM
4899 /* literal with own CV */
4900 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4901 newcv = rex->qr_anoncv;
4902 nop = (OP*)rexi->data->data[n];
68e2671b 4903 }
81ed78b2 4904
0e458318
DM
4905 /* normally if we're about to execute code from the same
4906 * CV that we used previously, we just use the existing
4907 * CX stack entry. However, its possible that in the
4908 * meantime we may have backtracked, popped from the save
4909 * stack, and undone the SAVECOMPPAD(s) associated with
4910 * PUSH_MULTICALL; in which case PL_comppad no longer
4911 * points to newcv's pad. */
4912 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4913 {
4914 I32 depth = (newcv == caller_cv) ? 0 : 1;
4915 if (last_pushed_cv) {
4916 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4917 }
4918 else {
4919 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4920 }
4921 last_pushed_cv = newcv;
4922 }
c31ee3bb
DM
4923 else {
4924 /* these assignments are just to silence compiler
4925 * warnings */
4926 multicall_cop = NULL;
4927 newsp = NULL;
4928 }
0e458318
DM
4929 last_pad = PL_comppad;
4930
2e2e3f36
DM
4931 /* the initial nextstate you would normally execute
4932 * at the start of an eval (which would cause error
4933 * messages to come from the eval), may be optimised
4934 * away from the execution path in the regex code blocks;
4935 * so manually set PL_curcop to it initially */
4936 {
81ed78b2 4937 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
4938 assert(o->op_type == OP_NULL);
4939 if (o->op_targ == OP_SCOPE) {
4940 o = cUNOPo->op_first;
4941 }
4942 else {
4943 assert(o->op_targ == OP_LEAVE);
4944 o = cUNOPo->op_first;
4945 assert(o->op_type == OP_ENTER);
4946 o = o->op_sibling;
4947 }
4948
4949 if (o->op_type != OP_STUB) {
4950 assert( o->op_type == OP_NEXTSTATE
4951 || o->op_type == OP_DBSTATE
4952 || (o->op_type == OP_NULL
4953 && ( o->op_targ == OP_NEXTSTATE
4954 || o->op_targ == OP_DBSTATE
4955 )
4956 )
4957 );
4958 PL_curcop = (COP*)o;
4959 }
4960 }
81ed78b2 4961 nop = nop->op_next;
2e2e3f36 4962
24b23f37 4963 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
4964 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4965
b93070ed 4966 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4967
2bf803e2
YO
4968 if (sv_yes_mark) {
4969 SV *sv_mrk = get_sv("REGMARK", 1);
4970 sv_setsv(sv_mrk, sv_yes_mark);
4971 }
4972
81ed78b2
DM
4973 /* we don't use MULTICALL here as we want to call the
4974 * first op of the block of interest, rather than the
4975 * first op of the sub */
a6dc34f1 4976 before = (IV)(SP-PL_stack_base);
81ed78b2 4977 PL_op = nop;
8e5e9ebe
RGS
4978 CALLRUNOPS(aTHX); /* Scalar context. */
4979 SPAGAIN;
a6dc34f1 4980 if ((IV)(SP-PL_stack_base) == before)
075aa684 4981 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4982 else {
4983 ret = POPs;
4984 PUTBACK;
4985 }
4aabdb9b 4986
e4bfbed3
DM
4987 /* before restoring everything, evaluate the returned
4988 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
4989 * PL_op or pad. Also need to process any magic vars
4990 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
4991
4992 PL_op = NULL;
4993
5e98dac2 4994 re_sv = NULL;
e4bfbed3
DM
4995 if (logical == 0) /* (?{})/ */
4996 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4997 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4998 sw = cBOOL(SvTRUE(ret));
4999 logical = 0;
5000 }
5001 else { /* /(??{}) */
497d0a96
DM
5002 /* if its overloaded, let the regex compiler handle
5003 * it; otherwise extract regex, or stringify */
5004 if (!SvAMAGIC(ret)) {
5005 SV *sv = ret;
5006 if (SvROK(sv))
5007 sv = SvRV(sv);
5008 if (SvTYPE(sv) == SVt_REGEXP)
5009 re_sv = (REGEXP*) sv;
5010 else if (SvSMAGICAL(sv)) {
5011 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5012 if (mg)
5013 re_sv = (REGEXP *) mg->mg_obj;
5014 }
e4bfbed3 5015
497d0a96
DM
5016 /* force any magic, undef warnings here */
5017 if (!re_sv) {
5018 ret = sv_mortalcopy(ret);
5019 (void) SvPV_force_nolen(ret);
5020 }
e4bfbed3
DM
5021 }
5022
5023 }
5024
91332126
FR
5025 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5026
81ed78b2
DM
5027 /* *** Note that at this point we don't restore
5028 * PL_comppad, (or pop the CxSUB) on the assumption it may
5029 * be used again soon. This is safe as long as nothing
5030 * in the regexp code uses the pad ! */
4aabdb9b 5031 PL_op = oop;
4aabdb9b 5032 PL_curcop = ocurcop;
d80618d2 5033 PL_regeol = saved_regeol;
92da3157 5034 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
e4bfbed3
DM
5035
5036 if (logical != 2)
4aabdb9b 5037 break;
8e5e9ebe 5038 }
e4bfbed3
DM
5039
5040 /* only /(??{})/ from now on */
24d3c4a9 5041 logical = 0;
4aabdb9b 5042 {
4f639d21
DM
5043 /* extract RE object from returned value; compiling if
5044 * necessary */
5c35adbb 5045
575c37f6
DM
5046 if (re_sv) {
5047 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 5048 }
0f5d15d6 5049 else {
c737faaf 5050 U32 pm_flags = 0;
0f5d15d6 5051
9753d940
DM
5052 if (SvUTF8(ret) && IN_BYTES) {
5053 /* In use 'bytes': make a copy of the octet
5054 * sequence, but without the flag on */
b9ad30b4
NC
5055 STRLEN len;
5056 const char *const p = SvPV(ret, len);
5057 ret = newSVpvn_flags(p, len, SVs_TEMP);
5058 }
732caac7
DM
5059 if (rex->intflags & PREGf_USE_RE_EVAL)
5060 pm_flags |= PMf_USE_RE_EVAL;
5061
5062 /* if we got here, it should be an engine which
5063 * supports compiling code blocks and stuff */
5064 assert(rex->engine && rex->engine->op_comp);
ec841a27 5065 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 5066 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
5067 rex->engine, NULL, NULL,
5068 /* copy /msix etc to inner pattern */
5069 scan->flags,
5070 pm_flags);
732caac7 5071
9041c2e3 5072 if (!(SvFLAGS(ret)
faf82a0b 5073 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 5074 | SVs_GMG))) {
a2794585
NC
5075 /* This isn't a first class regexp. Instead, it's
5076 caching a regexp onto an existing, Perl visible
5077 scalar. */
575c37f6 5078 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 5079 }
74088413
DM
5080 /* safe to do now that any $1 etc has been
5081 * interpolated into the new pattern string and
5082 * compiled */
92da3157 5083 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
0f5d15d6 5084 }
e1ff3a88 5085 SAVEFREESV(re_sv);
8d919b0a 5086 re = ReANY(re_sv);
4aabdb9b 5087 }
07bc277f 5088 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
5089 re->subbeg = rex->subbeg;
5090 re->sublen = rex->sublen;
6502e081
DM
5091 re->suboffset = rex->suboffset;
5092 re->subcoffset = rex->subcoffset;
f8fc2ecf 5093 rei = RXi_GET(re);
6bda09f9 5094 DEBUG_EXECUTE_r(
f2ed9b32 5095 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
5096 "Matching embedded");
5097 );
f8fc2ecf 5098 startpoint = rei->program + 1;
1a147d38 5099 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 5100
1a147d38 5101 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 5102 /* run the pattern returned from (??{...}) */
92da3157
DM
5103
5104 /* Save *all* the positions. */
5105 ST.cp = regcppush(rex, 0, maxopenparen);
40a82448 5106 REGCP_SET(ST.lastcp);
6bda09f9 5107
0357f1fd
ML
5108 re->lastparen = 0;
5109 re->lastcloseparen = 0;
5110
92da3157 5111 maxopenparen = 0;
4aabdb9b
DM
5112
5113 /* XXXX This is too dramatic a measure... */
5114 PL_reg_maxiter = 0;
5115
984e6dd1
DM
5116 ST.saved_utf8_pat = is_utf8_pat;
5117 is_utf8_pat = cBOOL(RX_UTF8(re_sv));
faec1544 5118
288b8c02 5119 ST.prev_rex = rex_sv;
faec1544 5120 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
5121 rex_sv = re_sv;
5122 SET_reg_curpm(rex_sv);
288b8c02 5123 rex = re;
f8fc2ecf 5124 rexi = rei;
faec1544 5125 cur_curlyx = NULL;
40a82448 5126 ST.B = next;
faec1544
DM
5127 ST.prev_eval = cur_eval;
5128 cur_eval = st;
faec1544 5129 /* now continue from first node in postoned RE */
4d5016e5 5130 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
118e2215 5131 assert(0); /* NOTREACHED */
c277df42 5132 }
40a82448 5133
faec1544
DM
5134 case EVAL_AB: /* cleanup after a successful (??{A})B */
5135 /* note: this is called twice; first after popping B, then A */
984e6dd1 5136 is_utf8_pat = ST.saved_utf8_pat;
ec43f78b
DM
5137 rex_sv = ST.prev_rex;
5138 SET_reg_curpm(rex_sv);
8d919b0a 5139 rex = ReANY(rex_sv);
f8fc2ecf 5140 rexi = RXi_GET(rex);
faec1544
DM
5141 regcpblow(ST.cp);
5142 cur_eval = ST.prev_eval;
5143 cur_curlyx = ST.prev_curlyx;
34a81e2b 5144
40a82448
DM
5145 /* XXXX This is too dramatic a measure... */
5146 PL_reg_maxiter = 0;
e7707071 5147 if ( nochange_depth )
4b196cd4 5148 nochange_depth--;
262b90c4 5149 sayYES;
40a82448 5150
40a82448 5151
faec1544
DM
5152 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5153 /* note: this is called twice; first after popping B, then A */
984e6dd1 5154 is_utf8_pat = ST.saved_utf8_pat;
ec43f78b
DM
5155 rex_sv = ST.prev_rex;
5156 SET_reg_curpm(rex_sv);
8d919b0a 5157 rex = ReANY(rex_sv);
f8fc2ecf 5158 rexi = RXi_GET(rex);
0357f1fd 5159
40a82448 5160 REGCP_UNWIND(ST.lastcp);
92da3157 5161 regcppop(rex, &maxopenparen);
faec1544
DM
5162 cur_eval = ST.prev_eval;
5163 cur_curlyx = ST.prev_curlyx;
5164 /* XXXX This is too dramatic a measure... */
5165 PL_reg_maxiter = 0;
e7707071 5166 if ( nochange_depth )
4b196cd4 5167 nochange_depth--;
40a82448 5168 sayNO_SILENT;
40a82448
DM
5169#undef ST
5170
3c0563b9 5171 case OPEN: /* ( */
c277df42 5172 n = ARG(scan); /* which paren pair */
1ca2007e 5173 rex->offs[n].start_tmp = locinput - PL_bostr;
92da3157
DM
5174 if (n > maxopenparen)
5175 maxopenparen = n;
495f47a5 5176 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
92da3157 5177 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
495f47a5
DM
5178 PTR2UV(rex),
5179 PTR2UV(rex->offs),
5180 (UV)n,
5181 (IV)rex->offs[n].start_tmp,
92da3157 5182 (UV)maxopenparen
495f47a5 5183 ));
e2e6a0f1 5184 lastopen = n;
a0d0e21e 5185 break;
495f47a5
DM
5186
5187/* XXX really need to log other places start/end are set too */
5188#define CLOSE_CAPTURE \
5189 rex->offs[n].start = rex->offs[n].start_tmp; \
5190 rex->offs[n].end = locinput - PL_bostr; \
5191 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5192 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5193 PTR2UV(rex), \
5194 PTR2UV(rex->offs), \
5195 (UV)n, \
5196 (IV)rex->offs[n].start, \
5197 (IV)rex->offs[n].end \
5198 ))
5199
3c0563b9 5200 case CLOSE: /* ) */
c277df42 5201 n = ARG(scan); /* which paren pair */
495f47a5 5202 CLOSE_CAPTURE;
b93070ed
DM
5203 if (n > rex->lastparen)
5204 rex->lastparen = n;
5205 rex->lastcloseparen = n;
3b6647e0 5206 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
5207 goto fake_end;
5208 }
a0d0e21e 5209 break;
3c0563b9
DM
5210
5211 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
5212 if (ARG(scan)){
5213 regnode *cursor;
5214 for (cursor=scan;
5215 cursor && OP(cursor)!=END;
5216 cursor=regnext(cursor))
5217 {
5218 if ( OP(cursor)==CLOSE ){
5219 n = ARG(cursor);
5220 if ( n <= lastopen ) {
495f47a5 5221 CLOSE_CAPTURE;
b93070ed
DM
5222 if (n > rex->lastparen)
5223 rex->lastparen = n;
5224 rex->lastcloseparen = n;
3b6647e0
RB
5225 if ( n == ARG(scan) || (cur_eval &&
5226 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
5227 break;
5228 }
5229 }
5230 }
5231 }
5232 goto fake_end;
5233 /*NOTREACHED*/
3c0563b9
DM
5234
5235 case GROUPP: /* (?(1)) */
c277df42 5236 n = ARG(scan); /* which paren pair */
b93070ed 5237 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 5238 break;
3c0563b9
DM
5239
5240 case NGROUPP: /* (?(<name>)) */
0a4db386 5241 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 5242 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 5243 break;
3c0563b9
DM
5244
5245 case INSUBP: /* (?(R)) */
0a4db386 5246 n = ARG(scan);
3b6647e0 5247 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 5248 break;
3c0563b9
DM
5249
5250 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
5251 sw = 0;
5252 break;
3c0563b9
DM
5253
5254 case IFTHEN: /* (?(cond)A|B) */
2c2d71f5 5255 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 5256 if (sw)
c277df42
IZ
5257 next = NEXTOPER(NEXTOPER(scan));
5258 else {
5259 next = scan + ARG(scan);
5260 if (OP(next) == IFTHEN) /* Fake one. */
5261 next = NEXTOPER(NEXTOPER(next));
5262 }
5263 break;
3c0563b9
DM
5264
5265 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 5266 logical = scan->flags;
c277df42 5267 break;
c476f425 5268
2ab05381 5269/*******************************************************************
2ab05381 5270
c476f425
DM
5271The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5272pattern, where A and B are subpatterns. (For simple A, CURLYM or
5273STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 5274
c476f425 5275A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 5276
c476f425
DM
5277On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5278state, which contains the current count, initialised to -1. It also sets
5279cur_curlyx to point to this state, with any previous value saved in the
5280state block.
2ab05381 5281
c476f425
DM
5282CURLYX then jumps straight to the WHILEM op, rather than executing A,
5283since the pattern may possibly match zero times (i.e. it's a while {} loop
5284rather than a do {} while loop).
2ab05381 5285
c476f425
DM
5286Each entry to WHILEM represents a successful match of A. The count in the
5287CURLYX block is incremented, another WHILEM state is pushed, and execution
5288passes to A or B depending on greediness and the current count.
2ab05381 5289
c476f425
DM
5290For example, if matching against the string a1a2a3b (where the aN are
5291substrings that match /A/), then the match progresses as follows: (the
5292pushed states are interspersed with the bits of strings matched so far):
2ab05381 5293
c476f425
DM
5294 <CURLYX cnt=-1>
5295 <CURLYX cnt=0><WHILEM>
5296 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5297 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5298 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5299 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 5300
c476f425
DM
5301(Contrast this with something like CURLYM, which maintains only a single
5302backtrack state:
2ab05381 5303
c476f425
DM
5304 <CURLYM cnt=0> a1
5305 a1 <CURLYM cnt=1> a2
5306 a1 a2 <CURLYM cnt=2> a3
5307 a1 a2 a3 <CURLYM cnt=3> b
5308)
2ab05381 5309
c476f425
DM
5310Each WHILEM state block marks a point to backtrack to upon partial failure
5311of A or B, and also contains some minor state data related to that
5312iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5313overall state, such as the count, and pointers to the A and B ops.
2ab05381 5314
c476f425
DM
5315This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5316must always point to the *current* CURLYX block, the rules are:
2ab05381 5317
c476f425
DM
5318When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5319and set cur_curlyx to point the new block.
2ab05381 5320
c476f425
DM
5321When popping the CURLYX block after a successful or unsuccessful match,
5322restore the previous cur_curlyx.
2ab05381 5323
c476f425
DM
5324When WHILEM is about to execute B, save the current cur_curlyx, and set it
5325to the outer one saved in the CURLYX block.
2ab05381 5326
c476f425
DM
5327When popping the WHILEM block after a successful or unsuccessful B match,
5328restore the previous cur_curlyx.
2ab05381 5329
c476f425
DM
5330Here's an example for the pattern (AI* BI)*BO
5331I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 5332
c476f425
DM
5333cur_
5334curlyx backtrack stack
5335------ ---------------
5336NULL
5337CO <CO prev=NULL> <WO>
5338CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5339CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5340NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 5341
c476f425
DM
5342At this point the pattern succeeds, and we work back down the stack to
5343clean up, restoring as we go:
95b24440 5344
c476f425
DM
5345CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5346CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5347CO <CO prev=NULL> <WO>
5348NULL
a0374537 5349
c476f425
DM
5350*******************************************************************/
5351
5352#define ST st->u.curlyx
5353
5354 case CURLYX: /* start of /A*B/ (for complex A) */
5355 {
5356 /* No need to save/restore up to this paren */
5357 I32 parenfloor = scan->flags;
5358
5359 assert(next); /* keep Coverity happy */
5360 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5361 next += ARG(next);
5362
5363 /* XXXX Probably it is better to teach regpush to support
92da3157 5364 parenfloor > maxopenparen ... */
b93070ed
DM
5365 if (parenfloor > (I32)rex->lastparen)
5366 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
5367
5368 ST.prev_curlyx= cur_curlyx;
5369 cur_curlyx = st;
5370 ST.cp = PL_savestack_ix;
5371
5372 /* these fields contain the state of the current curly.
5373 * they are accessed by subsequent WHILEMs */
5374 ST.parenfloor = parenfloor;
d02d6d97 5375 ST.me = scan;
c476f425 5376 ST.B = next;
24d3c4a9
DM
5377 ST.minmod = minmod;
5378 minmod = 0;
c476f425
DM
5379 ST.count = -1; /* this will be updated by WHILEM */
5380 ST.lastloc = NULL; /* this will be updated by WHILEM */
5381
4d5016e5 5382 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
118e2215 5383 assert(0); /* NOTREACHED */
c476f425 5384 }
a0d0e21e 5385
c476f425 5386 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
5387 cur_curlyx = ST.prev_curlyx;
5388 sayYES;
118e2215 5389 assert(0); /* NOTREACHED */
a0d0e21e 5390
c476f425
DM
5391 case CURLYX_end_fail: /* just failed to match all of A*B */
5392 regcpblow(ST.cp);
5393 cur_curlyx = ST.prev_curlyx;
5394 sayNO;
118e2215 5395 assert(0); /* NOTREACHED */
4633a7c4 5396
a0d0e21e 5397
c476f425
DM
5398#undef ST
5399#define ST st->u.whilem
5400
5401 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5402 {
5403 /* see the discussion above about CURLYX/WHILEM */
c476f425 5404 I32 n;
d02d6d97
DM
5405 int min = ARG1(cur_curlyx->u.curlyx.me);
5406 int max = ARG2(cur_curlyx->u.curlyx.me);
5407 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5408
c476f425
DM
5409 assert(cur_curlyx); /* keep Coverity happy */
5410 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5411 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5412 ST.cache_offset = 0;
5413 ST.cache_mask = 0;
5414
c476f425
DM
5415
5416 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
5417 "%*s whilem: matched %ld out of %d..%d\n",
5418 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 5419 );
a0d0e21e 5420
c476f425 5421 /* First just match a string of min A's. */
a0d0e21e 5422
d02d6d97 5423 if (n < min) {
92da3157
DM
5424 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5425 maxopenparen);
c476f425 5426 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
5427 REGCP_SET(ST.lastcp);
5428
4d5016e5 5429 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
118e2215 5430 assert(0); /* NOTREACHED */
c476f425
DM
5431 }
5432
5433 /* If degenerate A matches "", assume A done. */
5434
5435 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5436 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5437 "%*s whilem: empty match detected, trying continuation...\n",
5438 REPORT_CODE_OFF+depth*2, "")
5439 );
5440 goto do_whilem_B_max;
5441 }
5442
5443 /* super-linear cache processing */
5444
5445 if (scan->flags) {
a0d0e21e 5446
2c2d71f5 5447 if (!PL_reg_maxiter) {
c476f425
DM
5448 /* start the countdown: Postpone detection until we
5449 * know the match is not *that* much linear. */
2c2d71f5 5450 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
5451 /* possible overflow for long strings and many CURLYX's */
5452 if (PL_reg_maxiter < 0)
5453 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
5454 PL_reg_leftiter = PL_reg_maxiter;
5455 }
c476f425 5456
2c2d71f5 5457 if (PL_reg_leftiter-- == 0) {
c476f425 5458 /* initialise cache */
3298f257 5459 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 5460 if (PL_reg_poscache) {
eb160463 5461 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
5462 Renew(PL_reg_poscache, size, char);
5463 PL_reg_poscache_size = size;
5464 }
5465 Zero(PL_reg_poscache, size, char);
5466 }
5467 else {
5468 PL_reg_poscache_size = size;
a02a5408 5469 Newxz(PL_reg_poscache, size, char);
2c2d71f5 5470 }
c476f425
DM
5471 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5472 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5473 PL_colors[4], PL_colors[5])
5474 );
2c2d71f5 5475 }
c476f425 5476
2c2d71f5 5477 if (PL_reg_leftiter < 0) {
c476f425
DM
5478 /* have we already failed at this position? */
5479 I32 offset, mask;
5480 offset = (scan->flags & 0xf) - 1
5481 + (locinput - PL_bostr) * (scan->flags>>4);
5482 mask = 1 << (offset % 8);
5483 offset /= 8;
5484 if (PL_reg_poscache[offset] & mask) {
5485 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5486 "%*s whilem: (cache) already tried at this position...\n",
5487 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5488 );
3298f257 5489 sayNO; /* cache records failure */
2c2d71f5 5490 }
c476f425
DM
5491 ST.cache_offset = offset;
5492 ST.cache_mask = mask;
2c2d71f5 5493 }
c476f425 5494 }
2c2d71f5 5495
c476f425 5496 /* Prefer B over A for minimal matching. */
a687059c 5497
c476f425
DM
5498 if (cur_curlyx->u.curlyx.minmod) {
5499 ST.save_curlyx = cur_curlyx;
5500 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
92da3157
DM
5501 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5502 maxopenparen);
c476f425 5503 REGCP_SET(ST.lastcp);
4d5016e5
DM
5504 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5505 locinput);
118e2215 5506 assert(0); /* NOTREACHED */
c476f425 5507 }
a0d0e21e 5508
c476f425
DM
5509 /* Prefer A over B for maximal matching. */
5510
d02d6d97 5511 if (n < max) { /* More greed allowed? */
92da3157
DM
5512 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5513 maxopenparen);
c476f425
DM
5514 cur_curlyx->u.curlyx.lastloc = locinput;
5515 REGCP_SET(ST.lastcp);
4d5016e5 5516 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
118e2215 5517 assert(0); /* NOTREACHED */
c476f425
DM
5518 }
5519 goto do_whilem_B_max;
5520 }
118e2215 5521 assert(0); /* NOTREACHED */
c476f425
DM
5522
5523 case WHILEM_B_min: /* just matched B in a minimal match */
5524 case WHILEM_B_max: /* just matched B in a maximal match */
5525 cur_curlyx = ST.save_curlyx;
5526 sayYES;
118e2215 5527 assert(0); /* NOTREACHED */
c476f425
DM
5528
5529 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5530 cur_curlyx = ST.save_curlyx;
5531 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5532 cur_curlyx->u.curlyx.count--;
5533 CACHEsayNO;
118e2215 5534 assert(0); /* NOTREACHED */
c476f425
DM
5535
5536 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
5537 /* FALL THROUGH */
5538 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa 5539 REGCP_UNWIND(ST.lastcp);
92da3157 5540 regcppop(rex, &maxopenparen);
c476f425
DM
5541 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5542 cur_curlyx->u.curlyx.count--;
5543 CACHEsayNO;
118e2215 5544 assert(0); /* NOTREACHED */
c476f425
DM
5545
5546 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5547 REGCP_UNWIND(ST.lastcp);
92da3157 5548 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
c476f425
DM
5549 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5550 "%*s whilem: failed, trying continuation...\n",
5551 REPORT_CODE_OFF+depth*2, "")
5552 );
5553 do_whilem_B_max:
5554 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5555 && ckWARN(WARN_REGEXP)
39819bd9 5556 && !reginfo->warned)
c476f425 5557 {
39819bd9 5558 reginfo->warned = TRUE;
dcbac5bb
FC
5559 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5560 "Complex regular subexpression recursion limit (%d) "
5561 "exceeded",
c476f425
DM
5562 REG_INFTY - 1);
5563 }
5564
5565 /* now try B */
5566 ST.save_curlyx = cur_curlyx;
5567 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
5568 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5569 locinput);
118e2215 5570 assert(0); /* NOTREACHED */
c476f425
DM
5571
5572 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5573 cur_curlyx = ST.save_curlyx;
5574 REGCP_UNWIND(ST.lastcp);
92da3157 5575 regcppop(rex, &maxopenparen);
c476f425 5576
d02d6d97 5577 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5578 /* Maximum greed exceeded */
5579 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5580 && ckWARN(WARN_REGEXP)
39819bd9 5581 && !reginfo->warned)
c476f425 5582 {
39819bd9 5583 reginfo->warned = TRUE;
c476f425 5584 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5585 "Complex regular subexpression recursion "
5586 "limit (%d) exceeded",
c476f425 5587 REG_INFTY - 1);
a0d0e21e 5588 }
c476f425 5589 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5590 CACHEsayNO;
a0d0e21e 5591 }
c476f425
DM
5592
5593 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5594 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5595 );
5596 /* Try grabbing another A and see if it helps. */
c476f425 5597 cur_curlyx->u.curlyx.lastloc = locinput;
92da3157
DM
5598 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5599 maxopenparen);
c476f425 5600 REGCP_SET(ST.lastcp);
d02d6d97 5601 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
5602 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5603 locinput);
118e2215 5604 assert(0); /* NOTREACHED */
40a82448
DM
5605
5606#undef ST
5607#define ST st->u.branch
5608
5609 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5610 next = scan + ARG(scan);
5611 if (next == scan)
5612 next = NULL;
40a82448
DM
5613 scan = NEXTOPER(scan);
5614 /* FALL THROUGH */
c277df42 5615
40a82448
DM
5616 case BRANCH: /* /(...|A|...)/ */
5617 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5618 ST.lastparen = rex->lastparen;
f6033a9d 5619 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5620 ST.next_branch = next;
5621 REGCP_SET(ST.cp);
02db2b7b 5622
40a82448 5623 /* Now go into the branch */
5d458dd8 5624 if (has_cutgroup) {
4d5016e5 5625 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5626 } else {
4d5016e5 5627 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5628 }
118e2215 5629 assert(0); /* NOTREACHED */
3c0563b9
DM
5630
5631 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 5632 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5633 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 5634 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
118e2215 5635 assert(0); /* NOTREACHED */
3c0563b9 5636
5d458dd8
YO
5637 case CUTGROUP_next_fail:
5638 do_cutgroup = 1;
5639 no_final = 1;
5640 if (st->u.mark.mark_name)
5641 sv_commit = st->u.mark.mark_name;
5642 sayNO;
118e2215 5643 assert(0); /* NOTREACHED */
3c0563b9 5644
5d458dd8
YO
5645 case BRANCH_next:
5646 sayYES;
118e2215 5647 assert(0); /* NOTREACHED */
3c0563b9 5648
40a82448 5649 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5650 if (do_cutgroup) {
5651 do_cutgroup = 0;
5652 no_final = 0;
5653 }
40a82448 5654 REGCP_UNWIND(ST.cp);
a8d1f4b4 5655 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5656 scan = ST.next_branch;
5657 /* no more branches? */
5d458dd8
YO
5658 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5659 DEBUG_EXECUTE_r({
5660 PerlIO_printf( Perl_debug_log,
5661 "%*s %sBRANCH failed...%s\n",
5662 REPORT_CODE_OFF+depth*2, "",
5663 PL_colors[4],
5664 PL_colors[5] );
5665 });
5666 sayNO_SILENT;
5667 }
40a82448 5668 continue; /* execute next BRANCH[J] op */
118e2215 5669 assert(0); /* NOTREACHED */
40a82448 5670
3c0563b9 5671 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 5672 minmod = 1;
a0d0e21e 5673 break;
40a82448
DM
5674
5675#undef ST
5676#define ST st->u.curlym
5677
5678 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5679
5680 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5681 * only a single backtracking state, no matter how many matches
40a82448
DM
5682 * there are in {m,n}. It relies on the pattern being constant
5683 * length, with no parens to influence future backrefs
5684 */
5685
5686 ST.me = scan;
dc45a647 5687 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5688
f6033a9d
DM
5689 ST.lastparen = rex->lastparen;
5690 ST.lastcloseparen = rex->lastcloseparen;
5691
40a82448
DM
5692 /* if paren positive, emulate an OPEN/CLOSE around A */
5693 if (ST.me->flags) {
3b6647e0 5694 U32 paren = ST.me->flags;
92da3157
DM
5695 if (paren > maxopenparen)
5696 maxopenparen = paren;
c277df42 5697 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5698 }
40a82448
DM
5699 ST.A = scan;
5700 ST.B = next;
5701 ST.alen = 0;
5702 ST.count = 0;
24d3c4a9
DM
5703 ST.minmod = minmod;
5704 minmod = 0;
40a82448
DM
5705 ST.c1 = CHRTEST_UNINIT;
5706 REGCP_SET(ST.cp);
6407bf3b 5707
40a82448
DM
5708 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5709 goto curlym_do_B;
5710
5711 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 5712 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
118e2215 5713 assert(0); /* NOTREACHED */
5f80c4cf 5714
40a82448 5715 case CURLYM_A: /* we've just matched an A */
40a82448
DM
5716 ST.count++;
5717 /* after first match, determine A's length: u.curlym.alen */
5718 if (ST.count == 1) {
5719 if (PL_reg_match_utf8) {
c07e9d7b
DM
5720 char *s = st->locinput;
5721 while (s < locinput) {
40a82448
DM
5722 ST.alen++;
5723 s += UTF8SKIP(s);
5724 }
5725 }
5726 else {
c07e9d7b 5727 ST.alen = locinput - st->locinput;
40a82448
DM
5728 }
5729 if (ST.alen == 0)
5730 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5731 }
0cadcf80
DM
5732 DEBUG_EXECUTE_r(
5733 PerlIO_printf(Perl_debug_log,
40a82448 5734 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5735 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5736 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5737 );
5738
0a4db386 5739 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5740 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5741 goto fake_end;
5742
c966426a
DM
5743 {
5744 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5745 if ( max == REG_INFTY || ST.count < max )
5746 goto curlym_do_A; /* try to match another A */
5747 }
40a82448 5748 goto curlym_do_B; /* try to match B */
5f80c4cf 5749
40a82448
DM
5750 case CURLYM_A_fail: /* just failed to match an A */
5751 REGCP_UNWIND(ST.cp);
0a4db386
YO
5752
5753 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5754 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5755 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5756 sayNO;
0cadcf80 5757
40a82448 5758 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
5759 if (ST.c1 == CHRTEST_UNINIT) {
5760 /* calculate c1 and c2 for possible match of 1st char
5761 * following curly */
5762 ST.c1 = ST.c2 = CHRTEST_VOID;
5763 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5764 regnode *text_node = ST.B;
5765 if (! HAS_TEXT(text_node))
5766 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5767 /* this used to be
5768
5769 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5770
5771 But the former is redundant in light of the latter.
5772
5773 if this changes back then the macro for
5774 IS_TEXT and friends need to change.
5775 */
c74f6de9 5776 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8 5777 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1
DM
5778 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5779 is_utf8_pat))
c74f6de9
KW
5780 {
5781 sayNO;
5782 }
c277df42 5783 }
c277df42 5784 }
40a82448
DM
5785 }
5786
5787 DEBUG_EXECUTE_r(
5788 PerlIO_printf(Perl_debug_log,
5789 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5790 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5791 "", (IV)ST.count)
5792 );
c74f6de9 5793 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
5794 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5795 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5796 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5797 {
5798 /* simulate B failing */
5799 DEBUG_OPTIMISE_r(
5800 PerlIO_printf(Perl_debug_log,
5801 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5802 (int)(REPORT_CODE_OFF+(depth*2)),"",
5803 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5804 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5805 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5806 );
5807 state_num = CURLYM_B_fail;
5808 goto reenter_switch;
5809 }
5810 }
5811 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
5812 /* simulate B failing */
5813 DEBUG_OPTIMISE_r(
5814 PerlIO_printf(Perl_debug_log,
79a2a0e8 5815 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5400f398 5816 (int)(REPORT_CODE_OFF+(depth*2)),"",
79a2a0e8
KW
5817 (int) nextchr, ST.c1, ST.c2)
5818 );
5400f398
KW
5819 state_num = CURLYM_B_fail;
5820 goto reenter_switch;
5821 }
c74f6de9 5822 }
40a82448
DM
5823
5824 if (ST.me->flags) {
f6033a9d 5825 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5826 I32 paren = ST.me->flags;
5827 if (ST.count) {
b93070ed 5828 rex->offs[paren].start
c07e9d7b
DM
5829 = HOPc(locinput, -ST.alen) - PL_bostr;
5830 rex->offs[paren].end = locinput - PL_bostr;
f6033a9d
DM
5831 if ((U32)paren > rex->lastparen)
5832 rex->lastparen = paren;
5833 rex->lastcloseparen = paren;
c277df42 5834 }
40a82448 5835 else
b93070ed 5836 rex->offs[paren].end = -1;
0a4db386 5837 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5838 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5839 {
5840 if (ST.count)
5841 goto fake_end;
5842 else
5843 sayNO;
5844 }
c277df42 5845 }
0a4db386 5846
4d5016e5 5847 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
118e2215 5848 assert(0); /* NOTREACHED */
40a82448
DM
5849
5850 case CURLYM_B_fail: /* just failed to match a B */
5851 REGCP_UNWIND(ST.cp);
a8d1f4b4 5852 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5853 if (ST.minmod) {
84d2fa14
HS
5854 I32 max = ARG2(ST.me);
5855 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5856 sayNO;
5857 goto curlym_do_A; /* try to match a further A */
5858 }
5859 /* backtrack one A */
5860 if (ST.count == ARG1(ST.me) /* min */)
5861 sayNO;
5862 ST.count--;
7016d6eb 5863 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
5864 goto curlym_do_B; /* try to match B */
5865
c255a977
DM
5866#undef ST
5867#define ST st->u.curly
40a82448 5868
c255a977
DM
5869#define CURLY_SETPAREN(paren, success) \
5870 if (paren) { \
5871 if (success) { \
b93070ed
DM
5872 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5873 rex->offs[paren].end = locinput - PL_bostr; \
f6033a9d
DM
5874 if (paren > rex->lastparen) \
5875 rex->lastparen = paren; \
b93070ed 5876 rex->lastcloseparen = paren; \
c255a977 5877 } \
f6033a9d 5878 else { \
b93070ed 5879 rex->offs[paren].end = -1; \
f6033a9d
DM
5880 rex->lastparen = ST.lastparen; \
5881 rex->lastcloseparen = ST.lastcloseparen; \
5882 } \
c255a977
DM
5883 }
5884
b40a2c17 5885 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
5886 ST.paren = 0;
5887 ST.min = 0;
5888 ST.max = REG_INFTY;
a0d0e21e
LW
5889 scan = NEXTOPER(scan);
5890 goto repeat;
3c0563b9 5891
b40a2c17 5892 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
5893 ST.paren = 0;
5894 ST.min = 1;
5895 ST.max = REG_INFTY;
c277df42 5896 scan = NEXTOPER(scan);
c255a977 5897 goto repeat;
3c0563b9 5898
b40a2c17 5899 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
5900 ST.paren = scan->flags; /* Which paren to set */
5901 ST.lastparen = rex->lastparen;
f6033a9d 5902 ST.lastcloseparen = rex->lastcloseparen;
92da3157
DM
5903 if (ST.paren > maxopenparen)
5904 maxopenparen = ST.paren;
c255a977
DM
5905 ST.min = ARG1(scan); /* min to match */
5906 ST.max = ARG2(scan); /* max to match */
0a4db386 5907 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5908 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5909 ST.min=1;
5910 ST.max=1;
5911 }
c255a977
DM
5912 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5913 goto repeat;
3c0563b9 5914
b40a2c17 5915 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
5916 ST.paren = 0;
5917 ST.min = ARG1(scan); /* min to match */
5918 ST.max = ARG2(scan); /* max to match */
5919 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5920 repeat:
a0d0e21e
LW
5921 /*
5922 * Lookahead to avoid useless match attempts
5923 * when we know what character comes next.
c255a977 5924 *
5f80c4cf
JP
5925 * Used to only do .*x and .*?x, but now it allows
5926 * for )'s, ('s and (?{ ... })'s to be in the way
5927 * of the quantifier and the EXACT-like node. -- japhy
5928 */
5929
eb5c1be8 5930 assert(ST.min <= ST.max);
3337dfe3
KW
5931 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5932 ST.c1 = ST.c2 = CHRTEST_VOID;
5933 }
5934 else {
5f80c4cf
JP
5935 regnode *text_node = next;
5936
3dab1dad
YO
5937 if (! HAS_TEXT(text_node))
5938 FIND_NEXT_IMPT(text_node);
5f80c4cf 5939
9e137952 5940 if (! HAS_TEXT(text_node))
c255a977 5941 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5942 else {
ee9b8eae 5943 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5944 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5945 }
c74f6de9 5946 else {
ee9b8eae
YO
5947
5948 /* Currently we only get here when
5949
5950 PL_rekind[OP(text_node)] == EXACT
5951
5952 if this changes back then the macro for IS_TEXT and
5953 friends need to change. */
79a2a0e8 5954 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1
DM
5955 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5956 is_utf8_pat))
c74f6de9
KW
5957 {
5958 sayNO;
5959 }
5960 }
1aa99e6b 5961 }
bbce6d69 5962 }
c255a977
DM
5963
5964 ST.A = scan;
5965 ST.B = next;
24d3c4a9 5966 if (minmod) {
eb72505d 5967 char *li = locinput;
24d3c4a9 5968 minmod = 0;
984e6dd1
DM
5969 if (ST.min &&
5970 regrepeat(rex, &li, ST.A, ST.min, depth, is_utf8_pat)
5971 < ST.min)
4633a7c4 5972 sayNO;
7016d6eb 5973 SET_locinput(li);
c255a977 5974 ST.count = ST.min;
c255a977
DM
5975 REGCP_SET(ST.cp);
5976 if (ST.c1 == CHRTEST_VOID)
5977 goto curly_try_B_min;
5978
5979 ST.oldloc = locinput;
5980
5981 /* set ST.maxpos to the furthest point along the
5982 * string that could possibly match */
5983 if (ST.max == REG_INFTY) {
5984 ST.maxpos = PL_regeol - 1;
f2ed9b32 5985 if (utf8_target)
c255a977
DM
5986 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5987 ST.maxpos--;
5988 }
f2ed9b32 5989 else if (utf8_target) {
c255a977
DM
5990 int m = ST.max - ST.min;
5991 for (ST.maxpos = locinput;
9a902117 5992 m >0 && ST.maxpos < PL_regeol; m--)
c255a977
DM
5993 ST.maxpos += UTF8SKIP(ST.maxpos);
5994 }
5995 else {
5996 ST.maxpos = locinput + ST.max - ST.min;
5997 if (ST.maxpos >= PL_regeol)
5998 ST.maxpos = PL_regeol - 1;
5999 }
6000 goto curly_try_B_min_known;
6001
6002 }
6003 else {
eb72505d
DM
6004 /* avoid taking address of locinput, so it can remain
6005 * a register var */
6006 char *li = locinput;
984e6dd1
DM
6007 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth,
6008 is_utf8_pat);
c255a977
DM
6009 if (ST.count < ST.min)
6010 sayNO;
7016d6eb 6011 SET_locinput(li);
c255a977
DM
6012 if ((ST.count > ST.min)
6013 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6014 {
6015 /* A{m,n} must come at the end of the string, there's
6016 * no point in backing off ... */
6017 ST.min = ST.count;
6018 /* ...except that $ and \Z can match before *and* after
6019 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6020 We may back off by one in this case. */
eb72505d 6021 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
6022 ST.min--;
6023 }
6024 REGCP_SET(ST.cp);
6025 goto curly_try_B_max;
6026 }
118e2215 6027 assert(0); /* NOTREACHED */
c255a977
DM
6028
6029
6030 case CURLY_B_min_known_fail:
6031 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 6032
c255a977 6033 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6034 if (ST.paren) {
6035 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6036 }
c255a977
DM
6037 /* Couldn't or didn't -- move forward. */
6038 ST.oldloc = locinput;
f2ed9b32 6039 if (utf8_target)
c255a977
DM
6040 locinput += UTF8SKIP(locinput);
6041 else
6042 locinput++;
6043 ST.count++;
6044 curly_try_B_min_known:
6045 /* find the next place where 'B' could work, then call B */
6046 {
6047 int n;
f2ed9b32 6048 if (utf8_target) {
c255a977
DM
6049 n = (ST.oldloc == locinput) ? 0 : 1;
6050 if (ST.c1 == ST.c2) {
c255a977 6051 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6052 while (locinput <= ST.maxpos
6053 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6054 {
6055 locinput += UTF8SKIP(locinput);
c255a977
DM
6056 n++;
6057 }
1aa99e6b
IH
6058 }
6059 else {
c255a977 6060 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6061 while (locinput <= ST.maxpos
6062 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6063 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6064 {
6065 locinput += UTF8SKIP(locinput);
c255a977 6066 n++;
1aa99e6b 6067 }
0fe9bf95
IZ
6068 }
6069 }
5400f398 6070 else { /* Not utf8_target */
c255a977
DM
6071 if (ST.c1 == ST.c2) {
6072 while (locinput <= ST.maxpos &&
6073 UCHARAT(locinput) != ST.c1)
6074 locinput++;
bbce6d69 6075 }
c255a977
DM
6076 else {
6077 while (locinput <= ST.maxpos
6078 && UCHARAT(locinput) != ST.c1
6079 && UCHARAT(locinput) != ST.c2)
6080 locinput++;
a0ed51b3 6081 }
c255a977
DM
6082 n = locinput - ST.oldloc;
6083 }
6084 if (locinput > ST.maxpos)
6085 sayNO;
c255a977 6086 if (n) {
eb72505d
DM
6087 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6088 * at b; check that everything between oldloc and
6089 * locinput matches */
6090 char *li = ST.oldloc;
c255a977 6091 ST.count += n;
984e6dd1 6092 if (regrepeat(rex, &li, ST.A, n, depth, is_utf8_pat) < n)
4633a7c4 6093 sayNO;
eb72505d 6094 assert(n == REG_INFTY || locinput == li);
a0d0e21e 6095 }
c255a977 6096 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6097 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6098 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6099 goto fake_end;
6100 }
4d5016e5 6101 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 6102 }
118e2215 6103 assert(0); /* NOTREACHED */
c255a977
DM
6104
6105
6106 case CURLY_B_min_fail:
6107 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
6108
6109 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6110 if (ST.paren) {
6111 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6112 }
c255a977 6113 /* failed -- move forward one */
f73aaa43 6114 {
eb72505d 6115 char *li = locinput;
984e6dd1 6116 if (!regrepeat(rex, &li, ST.A, 1, depth, is_utf8_pat)) {
f73aaa43
DM
6117 sayNO;
6118 }
eb72505d 6119 locinput = li;
f73aaa43
DM
6120 }
6121 {
c255a977 6122 ST.count++;
c255a977
DM
6123 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6124 ST.count > 0)) /* count overflow ? */
15272685 6125 {
c255a977
DM
6126 curly_try_B_min:
6127 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6128 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6129 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6130 goto fake_end;
6131 }
4d5016e5 6132 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
6133 }
6134 }
c74f6de9 6135 sayNO;
118e2215 6136 assert(0); /* NOTREACHED */
c255a977
DM
6137
6138
6139 curly_try_B_max:
6140 /* a successful greedy match: now try to match B */
40d049e4 6141 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6142 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
6143 goto fake_end;
6144 }
c255a977 6145 {
79a2a0e8
KW
6146 bool could_match = locinput < PL_regeol;
6147
c255a977 6148 /* If it could work, try it. */
79a2a0e8
KW
6149 if (ST.c1 != CHRTEST_VOID && could_match) {
6150 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6151 {
6152 could_match = memEQ(locinput,
6153 ST.c1_utf8,
6154 UTF8SKIP(locinput))
6155 || memEQ(locinput,
6156 ST.c2_utf8,
6157 UTF8SKIP(locinput));
6158 }
6159 else {
6160 could_match = UCHARAT(locinput) == ST.c1
6161 || UCHARAT(locinput) == ST.c2;
6162 }
6163 }
6164 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 6165 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 6166 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
118e2215 6167 assert(0); /* NOTREACHED */
c255a977
DM
6168 }
6169 }
6170 /* FALL THROUGH */
3c0563b9 6171
c255a977
DM
6172 case CURLY_B_max_fail:
6173 /* failed to find B in a greedy match */
c255a977
DM
6174
6175 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6176 if (ST.paren) {
6177 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6178 }
c255a977
DM
6179 /* back up. */
6180 if (--ST.count < ST.min)
6181 sayNO;
eb72505d 6182 locinput = HOPc(locinput, -1);
c255a977
DM
6183 goto curly_try_B_max;
6184
6185#undef ST
6186
3c0563b9 6187 case END: /* last op of main pattern */
6bda09f9 6188 fake_end:
faec1544
DM
6189 if (cur_eval) {
6190 /* we've just finished A in /(??{A})B/; now continue with B */
984e6dd1
DM
6191 st->u.eval.saved_utf8_pat = is_utf8_pat;
6192 is_utf8_pat = cur_eval->u.eval.saved_utf8_pat;
faec1544 6193
288b8c02 6194 st->u.eval.prev_rex = rex_sv; /* inner */
92da3157
DM
6195
6196 /* Save *all* the positions. */
6197 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
ec43f78b
DM
6198 rex_sv = cur_eval->u.eval.prev_rex;
6199 SET_reg_curpm(rex_sv);
8d919b0a 6200 rex = ReANY(rex_sv);
f8fc2ecf 6201 rexi = RXi_GET(rex);
faec1544 6202 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 6203
faec1544 6204 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
6205
6206 /* Restore parens of the outer rex without popping the
6207 * savestack */
92da3157
DM
6208 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6209 &maxopenparen);
faec1544
DM
6210
6211 st->u.eval.prev_eval = cur_eval;
6212 cur_eval = cur_eval->u.eval.prev_eval;
6213 DEBUG_EXECUTE_r(
2a49f0f5
JH
6214 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6215 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
6216 if ( nochange_depth )
6217 nochange_depth--;
6218
4d5016e5
DM
6219 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6220 locinput); /* match B */
faec1544
DM
6221 }
6222
3b0527fe 6223 if (locinput < reginfo->till) {
a3621e74 6224 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
6225 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6226 PL_colors[4],
6227 (long)(locinput - PL_reg_starttry),
3b0527fe 6228 (long)(reginfo->till - PL_reg_starttry),
7821416a 6229 PL_colors[5]));
58e23c8d 6230
262b90c4 6231 sayNO_SILENT; /* Cannot match: too short. */
7821416a 6232 }
262b90c4 6233 sayYES; /* Success! */
dad79028
DM
6234
6235 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6236 DEBUG_EXECUTE_r(
6237 PerlIO_printf(Perl_debug_log,
6238 "%*s %ssubpattern success...%s\n",
5bc10b2c 6239 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 6240 sayYES; /* Success! */
dad79028 6241
40a82448
DM
6242#undef ST
6243#define ST st->u.ifmatch
6244
37f53970
DM
6245 {
6246 char *newstart;
6247
40a82448
DM
6248 case SUSPEND: /* (?>A) */
6249 ST.wanted = 1;
37f53970 6250 newstart = locinput;
9041c2e3 6251 goto do_ifmatch;
dad79028 6252
40a82448
DM
6253 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6254 ST.wanted = 0;
dad79028
DM
6255 goto ifmatch_trivial_fail_test;
6256
40a82448
DM
6257 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6258 ST.wanted = 1;
dad79028 6259 ifmatch_trivial_fail_test:
a0ed51b3 6260 if (scan->flags) {
52657f30 6261 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
6262 if (!s) {
6263 /* trivial fail */
24d3c4a9
DM
6264 if (logical) {
6265 logical = 0;
f2338a2e 6266 sw = 1 - cBOOL(ST.wanted);
dad79028 6267 }
40a82448 6268 else if (ST.wanted)
dad79028
DM
6269 sayNO;
6270 next = scan + ARG(scan);
6271 if (next == scan)
6272 next = NULL;
6273 break;
6274 }
37f53970 6275 newstart = s;
a0ed51b3
LW
6276 }
6277 else
37f53970 6278 newstart = locinput;
a0ed51b3 6279
c277df42 6280 do_ifmatch:
40a82448 6281 ST.me = scan;
24d3c4a9 6282 ST.logical = logical;
24d786f4
YO
6283 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6284
40a82448 6285 /* execute body of (?...A) */
37f53970 6286 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
118e2215 6287 assert(0); /* NOTREACHED */
37f53970 6288 }
40a82448
DM
6289
6290 case IFMATCH_A_fail: /* body of (?...A) failed */
6291 ST.wanted = !ST.wanted;
6292 /* FALL THROUGH */
6293
6294 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 6295 if (ST.logical) {
f2338a2e 6296 sw = cBOOL(ST.wanted);
40a82448
DM
6297 }
6298 else if (!ST.wanted)
6299 sayNO;
6300
37f53970
DM
6301 if (OP(ST.me) != SUSPEND) {
6302 /* restore old position except for (?>...) */
6303 locinput = st->locinput;
40a82448
DM
6304 }
6305 scan = ST.me + ARG(ST.me);
6306 if (scan == ST.me)
6307 scan = NULL;
6308 continue; /* execute B */
6309
6310#undef ST
dad79028 6311
3c0563b9
DM
6312 case LONGJMP: /* alternative with many branches compiles to
6313 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
6314 next = scan + ARG(scan);
6315 if (next == scan)
6316 next = NULL;
a0d0e21e 6317 break;
3c0563b9
DM
6318
6319 case COMMIT: /* (*COMMIT) */
e2e6a0f1
YO
6320 reginfo->cutpoint = PL_regeol;
6321 /* FALLTHROUGH */
3c0563b9
DM
6322
6323 case PRUNE: /* (*PRUNE) */
e2e6a0f1 6324 if (!scan->flags)
ad64d0ec 6325 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6326 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
118e2215 6327 assert(0); /* NOTREACHED */
3c0563b9 6328
54612592
YO
6329 case COMMIT_next_fail:
6330 no_final = 1;
6331 /* FALLTHROUGH */
3c0563b9
DM
6332
6333 case OPFAIL: /* (*FAIL) */
7f69552c 6334 sayNO;
118e2215 6335 assert(0); /* NOTREACHED */
e2e6a0f1
YO
6336
6337#define ST st->u.mark
3c0563b9 6338 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 6339 ST.prev_mark = mark_state;
5d458dd8 6340 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 6341 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 6342 mark_state = st;
4d5016e5
DM
6343 ST.mark_loc = locinput;
6344 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
118e2215 6345 assert(0); /* NOTREACHED */
3c0563b9 6346
e2e6a0f1
YO
6347 case MARKPOINT_next:
6348 mark_state = ST.prev_mark;
6349 sayYES;
118e2215 6350 assert(0); /* NOTREACHED */
3c0563b9 6351
e2e6a0f1 6352 case MARKPOINT_next_fail:
5d458dd8 6353 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
6354 {
6355 if (ST.mark_loc > startpoint)
6356 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6357 popmark = NULL; /* we found our mark */
6358 sv_commit = ST.mark_name;
6359
6360 DEBUG_EXECUTE_r({
5d458dd8 6361 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
6362 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6363 REPORT_CODE_OFF+depth*2, "",
be2597df 6364 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
6365 });
6366 }
6367 mark_state = ST.prev_mark;
5d458dd8
YO
6368 sv_yes_mark = mark_state ?
6369 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 6370 sayNO;
118e2215 6371 assert(0); /* NOTREACHED */
3c0563b9
DM
6372
6373 case SKIP: /* (*SKIP) */
5d458dd8 6374 if (scan->flags) {
2bf803e2 6375 /* (*SKIP) : if we fail we cut here*/
5d458dd8 6376 ST.mark_name = NULL;
e2e6a0f1 6377 ST.mark_loc = locinput;
4d5016e5 6378 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 6379 } else {
2bf803e2 6380 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
6381 otherwise do nothing. Meaning we need to scan
6382 */
6383 regmatch_state *cur = mark_state;
ad64d0ec 6384 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
6385
6386 while (cur) {
6387 if ( sv_eq( cur->u.mark.mark_name,
6388 find ) )
6389 {
6390 ST.mark_name = find;
4d5016e5 6391 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
6392 }
6393 cur = cur->u.mark.prev_mark;
6394 }
e2e6a0f1 6395 }
2bf803e2 6396 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 6397 break;
3c0563b9 6398
5d458dd8
YO
6399 case SKIP_next_fail:
6400 if (ST.mark_name) {
6401 /* (*CUT:NAME) - Set up to search for the name as we
6402 collapse the stack*/
6403 popmark = ST.mark_name;
6404 } else {
6405 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
6406 if (ST.mark_loc > startpoint)
6407 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
6408 /* but we set sv_commit to latest mark_name if there
6409 is one so they can test to see how things lead to this
6410 cut */
6411 if (mark_state)
6412 sv_commit=mark_state->u.mark.mark_name;
6413 }
e2e6a0f1
YO
6414 no_final = 1;
6415 sayNO;
118e2215 6416 assert(0); /* NOTREACHED */
e2e6a0f1 6417#undef ST
3c0563b9
DM
6418
6419 case LNBREAK: /* \R */
7016d6eb 6420 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
e1d1eefb 6421 locinput += n;
e1d1eefb
YO
6422 } else
6423 sayNO;
6424 break;
6425
a0d0e21e 6426 default:
b900a521 6427 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 6428 PTR2UV(scan), OP(scan));
cea2e8a9 6429 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
6430
6431 /* this is a point to jump to in order to increment
6432 * locinput by one character */
6433 increment_locinput:
e6ca698c 6434 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
6435 if (utf8_target) {
6436 locinput += PL_utf8skip[nextchr];
7016d6eb 6437 /* locinput is allowed to go 1 char off the end, but not 2+ */
28b98f76
DM
6438 if (locinput > PL_regeol)
6439 sayNO;
28b98f76
DM
6440 }
6441 else
3640db6b 6442 locinput++;
28b98f76 6443 break;
5d458dd8
YO
6444
6445 } /* end switch */
95b24440 6446
5d458dd8
YO
6447 /* switch break jumps here */
6448 scan = next; /* prepare to execute the next op and ... */
6449 continue; /* ... jump back to the top, reusing st */
118e2215 6450 assert(0); /* NOTREACHED */
95b24440 6451
40a82448
DM
6452 push_yes_state:
6453 /* push a state that backtracks on success */
6454 st->u.yes.prev_yes_state = yes_state;
6455 yes_state = st;
6456 /* FALL THROUGH */
6457 push_state:
6458 /* push a new regex state, then continue at scan */
6459 {
6460 regmatch_state *newst;
6461
24b23f37
YO
6462 DEBUG_STACK_r({
6463 regmatch_state *cur = st;
6464 regmatch_state *curyes = yes_state;
6465 int curd = depth;
6466 regmatch_slab *slab = PL_regmatch_slab;
6467 for (;curd > -1;cur--,curd--) {
6468 if (cur < SLAB_FIRST(slab)) {
6469 slab = slab->prev;
6470 cur = SLAB_LAST(slab);
6471 }
6472 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6473 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6474 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6475 (curyes == cur) ? "yes" : ""
6476 );
6477 if (curyes == cur)
6478 curyes = cur->u.yes.prev_yes_state;
6479 }
6480 } else
6481 DEBUG_STATE_pp("push")
6482 );
40a82448 6483 depth++;
40a82448
DM
6484 st->locinput = locinput;
6485 newst = st+1;
6486 if (newst > SLAB_LAST(PL_regmatch_slab))
6487 newst = S_push_slab(aTHX);
6488 PL_regmatch_state = newst;
786e8c11 6489
4d5016e5 6490 locinput = pushinput;
40a82448
DM
6491 st = newst;
6492 continue;
118e2215 6493 assert(0); /* NOTREACHED */
40a82448 6494 }
a0d0e21e 6495 }
a687059c 6496
a0d0e21e
LW
6497 /*
6498 * We get here only if there's trouble -- normally "case END" is
6499 * the terminating point.
6500 */
cea2e8a9 6501 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6502 /*NOTREACHED*/
4633a7c4
LW
6503 sayNO;
6504
262b90c4 6505yes:
77cb431f
DM
6506 if (yes_state) {
6507 /* we have successfully completed a subexpression, but we must now
6508 * pop to the state marked by yes_state and continue from there */
77cb431f 6509 assert(st != yes_state);
5bc10b2c
DM
6510#ifdef DEBUGGING
6511 while (st != yes_state) {
6512 st--;
6513 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6514 PL_regmatch_slab = PL_regmatch_slab->prev;
6515 st = SLAB_LAST(PL_regmatch_slab);
6516 }
e2e6a0f1 6517 DEBUG_STATE_r({
54612592
YO
6518 if (no_final) {
6519 DEBUG_STATE_pp("pop (no final)");
6520 } else {
6521 DEBUG_STATE_pp("pop (yes)");
6522 }
e2e6a0f1 6523 });
5bc10b2c
DM
6524 depth--;
6525 }
6526#else
77cb431f
DM
6527 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6528 || yes_state > SLAB_LAST(PL_regmatch_slab))
6529 {
6530 /* not in this slab, pop slab */
6531 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6532 PL_regmatch_slab = PL_regmatch_slab->prev;
6533 st = SLAB_LAST(PL_regmatch_slab);
6534 }
6535 depth -= (st - yes_state);
5bc10b2c 6536#endif
77cb431f
DM
6537 st = yes_state;
6538 yes_state = st->u.yes.prev_yes_state;
6539 PL_regmatch_state = st;
24b23f37 6540
3640db6b 6541 if (no_final)
5d458dd8 6542 locinput= st->locinput;
54612592 6543 state_num = st->resume_state + no_final;
24d3c4a9 6544 goto reenter_switch;
77cb431f
DM
6545 }
6546
a3621e74 6547 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6548 PL_colors[4], PL_colors[5]));
02db2b7b 6549
ed301438 6550 if (PL_reg_state.re_state_eval_setup_done) {
19b95bf0
DM
6551 /* each successfully executed (?{...}) block does the equivalent of
6552 * local $^R = do {...}
6553 * When popping the save stack, all these locals would be undone;
6554 * bypass this by setting the outermost saved $^R to the latest
6555 * value */
6556 if (oreplsv != GvSV(PL_replgv))
6557 sv_setsv(oreplsv, GvSV(PL_replgv));
6558 }
95b24440 6559 result = 1;
aa283a38 6560 goto final_exit;
4633a7c4
LW
6561
6562no:
a3621e74 6563 DEBUG_EXECUTE_r(
7821416a 6564 PerlIO_printf(Perl_debug_log,
786e8c11 6565 "%*s %sfailed...%s\n",
5bc10b2c 6566 REPORT_CODE_OFF+depth*2, "",
786e8c11 6567 PL_colors[4], PL_colors[5])
7821416a 6568 );
aa283a38 6569
262b90c4 6570no_silent:
54612592
YO
6571 if (no_final) {
6572 if (yes_state) {
6573 goto yes;
6574 } else {
6575 goto final_exit;
6576 }
6577 }
aa283a38
DM
6578 if (depth) {
6579 /* there's a previous state to backtrack to */
40a82448
DM
6580 st--;
6581 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6582 PL_regmatch_slab = PL_regmatch_slab->prev;
6583 st = SLAB_LAST(PL_regmatch_slab);
6584 }
6585 PL_regmatch_state = st;
40a82448 6586 locinput= st->locinput;
40a82448 6587
5bc10b2c
DM
6588 DEBUG_STATE_pp("pop");
6589 depth--;
262b90c4
DM
6590 if (yes_state == st)
6591 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6592
24d3c4a9
DM
6593 state_num = st->resume_state + 1; /* failure = success + 1 */
6594 goto reenter_switch;
95b24440 6595 }
24d3c4a9 6596 result = 0;
aa283a38 6597
262b90c4 6598 final_exit:
bbe252da 6599 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6600 SV *sv_err = get_sv("REGERROR", 1);
6601 SV *sv_mrk = get_sv("REGMARK", 1);
6602 if (result) {
e2e6a0f1 6603 sv_commit = &PL_sv_no;
5d458dd8
YO
6604 if (!sv_yes_mark)
6605 sv_yes_mark = &PL_sv_yes;
6606 } else {
6607 if (!sv_commit)
6608 sv_commit = &PL_sv_yes;
6609 sv_yes_mark = &PL_sv_no;
6610 }
6611 sv_setsv(sv_err, sv_commit);
6612 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6613 }
19b95bf0 6614
81ed78b2
DM
6615
6616 if (last_pushed_cv) {
6617 dSP;
6618 POP_MULTICALL;
4f8dbb2d 6619 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6620 }
6621
2f554ef7
DM
6622 /* clean up; in particular, free all slabs above current one */
6623 LEAVE_SCOPE(oldsave);
5d9a96ca 6624
730f4c74
DM
6625 assert(!result || locinput - PL_bostr >= 0);
6626 return result ? locinput - PL_bostr : -1;
a687059c
LW
6627}
6628
6629/*
6630 - regrepeat - repeatedly match something simple, report how many
d60de1d1 6631 *
e64f369d
KW
6632 * What 'simple' means is a node which can be the operand of a quantifier like
6633 * '+', or {1,3}
6634 *
d60de1d1
DM
6635 * startposp - pointer a pointer to the start position. This is updated
6636 * to point to the byte following the highest successful
6637 * match.
6638 * p - the regnode to be repeatedly matched against.
4063ade8 6639 * max - maximum number of things to match.
d60de1d1 6640 * depth - (for debugging) backtracking depth.
a687059c 6641 */
76e3520e 6642STATIC I32
272d35c9 6643S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
984e6dd1 6644 I32 max, int depth, bool is_utf8_pat)
a687059c 6645{
27da23d5 6646 dVAR;
4063ade8 6647 char *scan; /* Pointer to current position in target string */
eb578fdb 6648 I32 c;
4063ade8
KW
6649 char *loceol = PL_regeol; /* local version */
6650 I32 hardcount = 0; /* How many matches so far */
eb578fdb 6651 bool utf8_target = PL_reg_match_utf8;
3018b823 6652 int to_complement = 0; /* Invert the result? */
d513472c 6653 UV utf8_flags;
3018b823 6654 _char_class_number classnum;
4f55667c
SP
6655#ifndef DEBUGGING
6656 PERL_UNUSED_ARG(depth);
6657#endif
a0d0e21e 6658
7918f24d
NC
6659 PERL_ARGS_ASSERT_REGREPEAT;
6660
f73aaa43 6661 scan = *startposp;
faf11cac
HS
6662 if (max == REG_INFTY)
6663 max = I32_MAX;
4063ade8 6664 else if (! utf8_target && scan + max < loceol)
7f596f4c 6665 loceol = scan + max;
4063ade8
KW
6666
6667 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6668 * to the maximum of how far we should go in it (leaving it set to the real
6669 * end, if the maximum permissible would take us beyond that). This allows
6670 * us to make the loop exit condition that we haven't gone past <loceol> to
6671 * also mean that we haven't exceeded the max permissible count, saving a
6672 * test each time through the loop. But it assumes that the OP matches a
6673 * single byte, which is true for most of the OPs below when applied to a
6674 * non-UTF-8 target. Those relatively few OPs that don't have this
6675 * characteristic will have to compensate.
6676 *
6677 * There is no adjustment for UTF-8 targets, as the number of bytes per
6678 * character varies. OPs will have to test both that the count is less
6679 * than the max permissible (using <hardcount> to keep track), and that we
6680 * are still within the bounds of the string (using <loceol>. A few OPs
6681 * match a single byte no matter what the encoding. They can omit the max
6682 * test if, for the UTF-8 case, they do the adjustment that was skipped
6683 * above.
6684 *
6685 * Thus, the code above sets things up for the common case; and exceptional
6686 * cases need extra work; the common case is to make sure <scan> doesn't
6687 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6688 * count doesn't exceed the maximum permissible */
6689
a0d0e21e 6690 switch (OP(p)) {
22c35a8c 6691 case REG_ANY:
f2ed9b32 6692 if (utf8_target) {
1aa99e6b 6693 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6694 scan += UTF8SKIP(scan);
6695 hardcount++;
6696 }
6697 } else {
6698 while (scan < loceol && *scan != '\n')
6699 scan++;
a0ed51b3
LW
6700 }
6701 break;
ffc61ed2 6702 case SANY:
f2ed9b32 6703 if (utf8_target) {
a0804c9e 6704 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6705 scan += UTF8SKIP(scan);
6706 hardcount++;
6707 }
6708 }
6709 else
6710 scan = loceol;
a0ed51b3 6711 break;
4063ade8
KW
6712 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6713 if (utf8_target && scan + max < loceol) {
6714
6715 /* <loceol> hadn't been adjusted in the UTF-8 case */
6716 scan += max;
6717 }
6718 else {
6719 scan = loceol;
6720 }
f33976b4 6721 break;
59d32103 6722 case EXACT:
984e6dd1 6723 assert(STR_LEN(p) == is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 6724
59d32103 6725 c = (U8)*STRING(p);
59d32103 6726
5e4a1da1
KW
6727 /* Can use a simple loop if the pattern char to match on is invariant
6728 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6729 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6730 * true iff it doesn't matter if the argument is in UTF-8 or not */
984e6dd1 6731 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! is_utf8_pat)) {
4063ade8
KW
6732 if (utf8_target && scan + max < loceol) {
6733 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6734 * since here, to match at all, 1 char == 1 byte */
6735 loceol = scan + max;
6736 }
59d32103
KW
6737 while (scan < loceol && UCHARAT(scan) == c) {
6738 scan++;
6739 }
6740 }
984e6dd1 6741 else if (is_utf8_pat) {
5e4a1da1
KW
6742 if (utf8_target) {
6743 STRLEN scan_char_len;
5e4a1da1 6744
4063ade8 6745 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
6746 * string EQ */
6747 while (hardcount < max
9a902117
KW
6748 && scan < loceol
6749 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
5e4a1da1
KW
6750 && memEQ(scan, STRING(p), scan_char_len))
6751 {
4200a00c 6752 scan += scan_char_len;
5e4a1da1
KW
6753 hardcount++;
6754 }
6755 }
6756 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 6757
5e4a1da1
KW
6758 /* Target isn't utf8; convert the character in the UTF-8
6759 * pattern to non-UTF8, and do a simple loop */
6760 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6761 while (scan < loceol && UCHARAT(scan) == c) {
6762 scan++;
6763 }
6764 } /* else pattern char is above Latin1, can't possibly match the
6765 non-UTF-8 target */
b40a2c17 6766 }
5e4a1da1 6767 else {
59d32103 6768
5e4a1da1
KW
6769 /* Here, the string must be utf8; pattern isn't, and <c> is
6770 * different in utf8 than not, so can't compare them directly.
6771 * Outside the loop, find the two utf8 bytes that represent c, and
6772 * then look for those in sequence in the utf8 string */
59d32103
KW
6773 U8 high = UTF8_TWO_BYTE_HI(c);
6774 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
6775
6776 while (hardcount < max
6777 && scan + 1 < loceol
6778 && UCHARAT(scan) == high
6779 && UCHARAT(scan + 1) == low)
6780 {
6781 scan += 2;
6782 hardcount++;
6783 }
6784 }
6785 break;
5e4a1da1 6786
2f7f8cb1
KW
6787 case EXACTFA:
6788 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6789 goto do_exactf;
6790
d4e0b827 6791 case EXACTFL:
272d35c9 6792 RXp_MATCH_TAINTED_on(prog);
17580e7a
KW
6793 utf8_flags = FOLDEQ_UTF8_LOCALE;
6794 goto do_exactf;
6795
d4e0b827 6796 case EXACTF:
62bf7766
KW
6797 utf8_flags = 0;
6798 goto do_exactf;
6799
3c760661 6800 case EXACTFU_SS:
fab2782b 6801 case EXACTFU_TRICKYFOLD:
9a5a5549 6802 case EXACTFU:
984e6dd1 6803 utf8_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6804
613a425d
KW
6805 do_exactf: {
6806 int c1, c2;
6807 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 6808
984e6dd1 6809 assert(STR_LEN(p) == is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 6810
984e6dd1
DM
6811 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6812 is_utf8_pat))
6813 {
613a425d 6814 if (c1 == CHRTEST_VOID) {
49b95fad 6815 /* Use full Unicode fold matching */
4063ade8 6816 char *tmpeol = PL_regeol;
984e6dd1 6817 STRLEN pat_len = is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
49b95fad
KW
6818 while (hardcount < max
6819 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6820 STRING(p), NULL, pat_len,
984e6dd1 6821 is_utf8_pat, utf8_flags))
49b95fad
KW
6822 {
6823 scan = tmpeol;
4063ade8 6824 tmpeol = PL_regeol;
49b95fad
KW
6825 hardcount++;
6826 }
613a425d
KW
6827 }
6828 else if (utf8_target) {
6829 if (c1 == c2) {
4063ade8
KW
6830 while (scan < loceol
6831 && hardcount < max
613a425d
KW
6832 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6833 {
6834 scan += UTF8SKIP(scan);
6835 hardcount++;
6836 }
6837 }
6838 else {
4063ade8
KW
6839 while (scan < loceol
6840 && hardcount < max
613a425d
KW
6841 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6842 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6843 {
6844 scan += UTF8SKIP(scan);
6845 hardcount++;
6846 }
6847 }
6848 }
6849 else if (c1 == c2) {
6850 while (scan < loceol && UCHARAT(scan) == c1) {
6851 scan++;
6852 }
6853 }
6854 else {
6855 while (scan < loceol &&
6856 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6857 {
6858 scan++;
6859 }
6860 }
634c83a2 6861 }
bbce6d69 6862 break;
613a425d 6863 }
a0d0e21e 6864 case ANYOF:
954a2af6 6865 case ANYOF_WARN_SUPER:
e0193e47 6866 if (utf8_target) {
4e8910e0 6867 while (hardcount < max
9a902117 6868 && scan < loceol
635cd5d4 6869 && reginclass(prog, p, (U8*)scan, utf8_target))
4e8910e0 6870 {
9a902117 6871 scan += UTF8SKIP(scan);
ffc61ed2
JH
6872 hardcount++;
6873 }
6874 } else {
32fc9b6a 6875 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6876 scan++;
6877 }
a0d0e21e 6878 break;
4063ade8 6879
3018b823 6880 /* The argument (FLAGS) to all the POSIX node types is the class number */
980866de 6881
3018b823
KW
6882 case NPOSIXL:
6883 to_complement = 1;
6884 /* FALLTHROUGH */
980866de 6885
3018b823 6886 case POSIXL:
272d35c9 6887 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
6888 if (! utf8_target) {
6889 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6890 *scan)))
a12cf05f 6891 {
3018b823
KW
6892 scan++;
6893 }
6894 } else {
6895 while (hardcount < max && scan < loceol
6896 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6897 (U8 *) scan)))
6898 {
6899 scan += UTF8SKIP(scan);
ffc61ed2
JH
6900 hardcount++;
6901 }
a0ed51b3
LW
6902 }
6903 break;
0658cdde 6904
3018b823
KW
6905 case POSIXD:
6906 if (utf8_target) {
6907 goto utf8_posix;
6908 }
6909 /* FALLTHROUGH */
6910
0658cdde 6911 case POSIXA:
4063ade8
KW
6912 if (utf8_target && scan + max < loceol) {
6913
7aee35ff
KW
6914 /* We didn't adjust <loceol> at the beginning of this routine
6915 * because is UTF-8, but it is actually ok to do so, since here, to
6916 * match, 1 char == 1 byte. */
4063ade8
KW
6917 loceol = scan + max;
6918 }
6919 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
6920 scan++;
6921 }
6922 break;
980866de 6923
3018b823
KW
6924 case NPOSIXD:
6925 if (utf8_target) {
6926 to_complement = 1;
6927 goto utf8_posix;
6928 }
6929 /* FALL THROUGH */
980866de 6930
3018b823
KW
6931 case NPOSIXA:
6932 if (! utf8_target) {
6933 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
a12cf05f
KW
6934 scan++;
6935 }
4063ade8 6936 }
3018b823 6937 else {
980866de 6938
3018b823
KW
6939 /* The complement of something that matches only ASCII matches all
6940 * UTF-8 variant code points, plus everything in ASCII that isn't
6941 * in the class. */
bedac28b 6942 while (hardcount < max && scan < loceol
3018b823
KW
6943 && (! UTF8_IS_INVARIANT(*scan)
6944 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
a12cf05f 6945 {
3018b823 6946 scan += UTF8SKIP(scan);
ffc61ed2
JH
6947 hardcount++;
6948 }
3018b823
KW
6949 }
6950 break;
980866de 6951
3018b823
KW
6952 case NPOSIXU:
6953 to_complement = 1;
6954 /* FALLTHROUGH */
6955
6956 case POSIXU:
6957 if (! utf8_target) {
6958 while (scan < loceol && to_complement
6959 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
4063ade8 6960 {
3018b823
KW
6961 scan++;
6962 }
cfaf538b
KW
6963 }
6964 else {
3018b823
KW
6965 utf8_posix:
6966 classnum = (_char_class_number) FLAGS(p);
6967 if (classnum < _FIRST_NON_SWASH_CC) {
6968
6969 /* Here, a swash is needed for above-Latin1 code points.
6970 * Process as many Latin1 code points using the built-in rules.
6971 * Go to another loop to finish processing upon encountering
6972 * the first Latin1 code point. We could do that in this loop
6973 * as well, but the other way saves having to test if the swash
6974 * has been loaded every time through the loop: extra space to
6975 * save a test. */
6976 while (hardcount < max && scan < loceol) {
6977 if (UTF8_IS_INVARIANT(*scan)) {
6978 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
6979 classnum))))
6980 {
6981 break;
6982 }
6983 scan++;
6984 }
6985 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
6986 if (! (to_complement
6987 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
6988 *(scan + 1)),
6989 classnum))))
6990 {
6991 break;
6992 }
6993 scan += 2;
6994 }
6995 else {
6996 goto found_above_latin1;
6997 }
6998
6999 hardcount++;
7000 }
7001 }
7002 else {
7003 /* For these character classes, the knowledge of how to handle
7004 * every code point is compiled in to Perl via a macro. This
7005 * code is written for making the loops as tight as possible.
7006 * It could be refactored to save space instead */
7007 switch (classnum) {
7008 case _CC_ENUM_SPACE: /* XXX would require separate code
7009 if we revert the change of \v
7010 matching this */
7011 /* FALL THROUGH */
7012 case _CC_ENUM_PSXSPC:
7013 while (hardcount < max
7014 && scan < loceol
7015 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7016 {
7017 scan += UTF8SKIP(scan);
7018 hardcount++;
7019 }
7020 break;
7021 case _CC_ENUM_BLANK:
7022 while (hardcount < max
7023 && scan < loceol
7024 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7025 {
7026 scan += UTF8SKIP(scan);
7027 hardcount++;
7028 }
7029 break;
7030 case _CC_ENUM_XDIGIT:
7031 while (hardcount < max
7032 && scan < loceol
7033 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7034 {
7035 scan += UTF8SKIP(scan);
7036 hardcount++;
7037 }
7038 break;
7039 case _CC_ENUM_VERTSPACE:
7040 while (hardcount < max
7041 && scan < loceol
7042 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7043 {
7044 scan += UTF8SKIP(scan);
7045 hardcount++;
7046 }
7047 break;
7048 case _CC_ENUM_CNTRL:
7049 while (hardcount < max
7050 && scan < loceol
7051 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7052 {
7053 scan += UTF8SKIP(scan);
7054 hardcount++;
7055 }
7056 break;
7057 default:
7058 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7059 }
7060 }
a0ed51b3 7061 }
3018b823 7062 break;
4063ade8 7063
3018b823
KW
7064 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7065
7066 /* Load the swash if not already present */
7067 if (! PL_utf8_swash_ptrs[classnum]) {
7068 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7069 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7070 "utf8", swash_property_names[classnum],
7071 &PL_sv_undef, 1, 0, NULL, &flags);
4063ade8 7072 }
3018b823
KW
7073
7074 while (hardcount < max && scan < loceol
7075 && to_complement ^ cBOOL(_generic_utf8(
7076 classnum,
7077 scan,
7078 swash_fetch(PL_utf8_swash_ptrs[classnum],
7079 (U8 *) scan,
7080 TRUE))))
7081 {
7082 scan += UTF8SKIP(scan);
7083 hardcount++;
7084 }
7085 break;
7086
e1d1eefb 7087 case LNBREAK:
e64f369d
KW
7088 if (utf8_target) {
7089 while (hardcount < max && scan < loceol &&
7090 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7091 scan += c;
7092 hardcount++;
7093 }
7094 } else {
7095 /* LNBREAK can match one or two latin chars, which is ok, but we
7096 * have to use hardcount in this situation, and throw away the
7097 * adjustment to <loceol> done before the switch statement */
7098 loceol = PL_regeol;
7099 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7100 scan+=c;
7101 hardcount++;
7102 }
7103 }
7104 break;
e1d1eefb 7105
584b1f02
KW
7106 case BOUND:
7107 case BOUNDA:
7108 case BOUNDL:
7109 case BOUNDU:
7110 case EOS:
7111 case GPOS:
7112 case KEEPS:
7113 case NBOUND:
7114 case NBOUNDA:
7115 case NBOUNDL:
7116 case NBOUNDU:
7117 case OPFAIL:
7118 case SBOL:
7119 case SEOL:
7120 /* These are all 0 width, so match right here or not at all. */
7121 break;
7122
7123 default:
7124 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7125 assert(0); /* NOTREACHED */
7126
a0d0e21e 7127 }
a687059c 7128
a0ed51b3
LW
7129 if (hardcount)
7130 c = hardcount;
7131 else
f73aaa43
DM
7132 c = scan - *startposp;
7133 *startposp = scan;
a687059c 7134
a3621e74 7135 DEBUG_r({
e68ec53f 7136 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 7137 DEBUG_EXECUTE_r({
e68ec53f
YO
7138 SV * const prop = sv_newmortal();
7139 regprop(prog, prop, p);
7140 PerlIO_printf(Perl_debug_log,
be8e71aa 7141 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 7142 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 7143 });
be8e71aa 7144 });
9041c2e3 7145
a0d0e21e 7146 return(c);
a687059c
LW
7147}
7148
c277df42 7149
be8e71aa 7150#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 7151/*
6c6525b8 7152- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
7153create a copy so that changes the caller makes won't change the shared one.
7154If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 7155 */
ffc61ed2 7156SV *
5aaab254 7157Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 7158{
6c6525b8 7159 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
7160
7161 if (altsvp) {
7162 *altsvp = NULL;
7163 }
7164
7165 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
6c6525b8
KW
7166}
7167#endif
7168
7169STATIC SV *
5aaab254 7170S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
6c6525b8 7171{
8c9eb58f
KW
7172 /* Returns the swash for the input 'node' in the regex 'prog'.
7173 * If <doinit> is true, will attempt to create the swash if not already
7174 * done.
7175 * If <listsvp> is non-null, will return the swash initialization string in
7176 * it.
8c9eb58f
KW
7177 * Tied intimately to how regcomp.c sets up the data structure */
7178
97aff369 7179 dVAR;
9e55ce06
JH
7180 SV *sw = NULL;
7181 SV *si = NULL;
7a6c6baa
KW
7182 SV* invlist = NULL;
7183
f8fc2ecf
YO
7184 RXi_GET_DECL(prog,progi);
7185 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 7186
6c6525b8 7187 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 7188
ccb2541c
KW
7189 assert(ANYOF_NONBITMAP(node));
7190
4f639d21 7191 if (data && data->count) {
a3b680e6 7192 const U32 n = ARG(node);
ffc61ed2 7193
4f639d21 7194 if (data->what[n] == 's') {
ad64d0ec
NC
7195 SV * const rv = MUTABLE_SV(data->data[n]);
7196 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 7197 SV **const ary = AvARRAY(av);
87367d5f 7198 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 7199
8c9eb58f 7200 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 7201
88675427
KW
7202 /* Elements 2 and 3 are either both present or both absent. [2] is
7203 * any inversion list generated at compile time; [3] indicates if
7a6c6baa 7204 * that inversion list has any user-defined properties in it. */
88675427
KW
7205 if (av_len(av) >= 2) {
7206 invlist = ary[2];
7207 if (SvUV(ary[3])) {
83199d38
KW
7208 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7209 }
7a6c6baa
KW
7210 }
7211 else {
7212 invlist = NULL;
7a6c6baa
KW
7213 }
7214
8c9eb58f
KW
7215 /* Element [1] is reserved for the set-up swash. If already there,
7216 * return it; if not, create it and store it there */
f192cf32
KW
7217 if (SvROK(ary[1])) {
7218 sw = ary[1];
7219 }
ffc61ed2 7220 else if (si && doinit) {
7a6c6baa
KW
7221
7222 sw = _core_swash_init("utf8", /* the utf8 package */
7223 "", /* nameless */
7224 si,
7225 1, /* binary */
7226 0, /* not from tr/// */
7a6c6baa 7227 invlist,
83199d38 7228 &swash_init_flags);
ffc61ed2
JH
7229 (void)av_store(av, 1, sw);
7230 }
7231 }
7232 }
7233
7a6c6baa
KW
7234 if (listsvp) {
7235 SV* matches_string = newSVpvn("", 0);
7a6c6baa
KW
7236
7237 /* Use the swash, if any, which has to have incorporated into it all
7238 * possibilities */
872dd7e0
KW
7239 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7240 && (si && si != &PL_sv_undef))
7241 {
7a6c6baa 7242
872dd7e0 7243 /* If no swash, use the input initialization string, if available */
7a6c6baa
KW
7244 sv_catsv(matches_string, si);
7245 }
7246
7247 /* Add the inversion list to whatever we have. This may have come from
7248 * the swash, or from an input parameter */
7249 if (invlist) {
7250 sv_catsv(matches_string, _invlist_contents(invlist));
7251 }
7252 *listsvp = matches_string;
7253 }
7254
ffc61ed2
JH
7255 return sw;
7256}
7257
7258/*
ba7b4546 7259 - reginclass - determine if a character falls into a character class
832705d4 7260
6698fab5
KW
7261 n is the ANYOF regnode
7262 p is the target string
6698fab5 7263 utf8_target tells whether p is in UTF-8.
832705d4 7264
635cd5d4 7265 Returns true if matched; false otherwise.
eba1359e 7266
d5788240
KW
7267 Note that this can be a synthetic start class, a combination of various
7268 nodes, so things you think might be mutually exclusive, such as locale,
7269 aren't. It can match both locale and non-locale
7270
bbce6d69 7271 */
7272
76e3520e 7273STATIC bool
272d35c9 7274S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
bbce6d69 7275{
27da23d5 7276 dVAR;
a3b680e6 7277 const char flags = ANYOF_FLAGS(n);
bbce6d69 7278 bool match = FALSE;
cc07378b 7279 UV c = *p;
1aa99e6b 7280
7918f24d
NC
7281 PERL_ARGS_ASSERT_REGINCLASS;
7282
afd2eb18
KW
7283 /* If c is not already the code point, get it. Note that
7284 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7285 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 7286 STRLEN c_len = 0;
f7ab54c6 7287 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
7288 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7289 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7290 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7291 * UTF8_ALLOW_FFFF */
f7ab54c6 7292 if (c_len == (STRLEN)-1)
e8a70c6f 7293 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 7294 }
4b3cda86 7295
7cdde544
KW
7296 /* If this character is potentially in the bitmap, check it */
7297 if (c < 256) {
ffc61ed2
JH
7298 if (ANYOF_BITMAP_TEST(n, c))
7299 match = TRUE;
11454c59
KW
7300 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7301 && ! utf8_target
7302 && ! isASCII(c))
7303 {
7304 match = TRUE;
7305 }
78969a98 7306 else if (flags & ANYOF_LOCALE) {
272d35c9 7307 RXp_MATCH_TAINTED_on(prog);
78969a98 7308
538b546e 7309 if ((flags & ANYOF_LOC_FOLD)
78969a98
KW
7310 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7311 {
ffc61ed2 7312 match = TRUE;
78969a98 7313 }
31c7f561
KW
7314 else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7315
7316 /* The data structure is arranged so bits 0, 2, 4, ... are set
7317 * if the class includes the Posix character class given by
7318 * bit/2; and 1, 3, 5, ... are set if the class includes the
7319 * complemented Posix class given by int(bit/2). So we loop
7320 * through the bits, each time changing whether we complement
7321 * the result or not. Suppose for the sake of illustration
7322 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7323 * is set, it means there is a match for this ANYOF node if the
7324 * character is in the class given by the expression (0 / 2 = 0
7325 * = \w). If it is in that class, isFOO_lc() will return 1,
7326 * and since 'to_complement' is 0, the result will stay TRUE,
7327 * and we exit the loop. Suppose instead that bit 0 is 0, but
7328 * bit 1 is 1. That means there is a match if the character
7329 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7330 * but will on bit 1. On the second iteration 'to_complement'
7331 * will be 1, so the exclusive or will reverse things, so we
7332 * are testing for \W. On the third iteration, 'to_complement'
7333 * will be 0, and we would be testing for \s; the fourth
b0d691b2
KW
7334 * iteration would test for \S, etc.
7335 *
7336 * Note that this code assumes that all the classes are closed
7337 * under folding. For example, if a character matches \w, then
7338 * its fold does too; and vice versa. This should be true for
7339 * any well-behaved locale for all the currently defined Posix
7340 * classes, except for :lower: and :upper:, which are handled
7341 * by the pseudo-class :cased: which matches if either of the
7342 * other two does. To get rid of this assumption, an outer
7343 * loop could be used below to iterate over both the source
7344 * character, and its fold (if different) */
31c7f561
KW
7345
7346 int count = 0;
7347 int to_complement = 0;
7348 while (count < ANYOF_MAX) {
7349 if (ANYOF_CLASS_TEST(n, count)
7350 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7351 {
7352 match = TRUE;
7353 break;
7354 }
7355 count++;
7356 to_complement ^= 1;
7357 }
ffc61ed2 7358 }
a0ed51b3 7359 }
a0ed51b3
LW
7360 }
7361
7cdde544 7362 /* If the bitmap didn't (or couldn't) match, and something outside the
1f327b5e 7363 * bitmap could match, try that. Locale nodes specify completely the
de87c4fe 7364 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 7365 * cause them to be treated as Unicode and not locale), except in
de87c4fe 7366 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
7367 * could be a combination of locale and non-locale nodes. So allow locale
7368 * to match for the synthetic start class, which will give a false
7369 * positive that will be resolved when the match is done again as not part
7370 * of the synthetic start class */
ef87b810 7371 if (!match) {
10ee90d2
KW
7372 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7373 match = TRUE; /* Everything above 255 matches */
e051a21d 7374 }
6f8d7d0d
KW
7375 else if (ANYOF_NONBITMAP(n)
7376 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7377 || (utf8_target
7378 && (c >=256
7379 || (! (flags & ANYOF_LOCALE))
9aa1e39f 7380 || OP(n) == ANYOF_SYNTHETIC))))
ef87b810 7381 {
e0193e47 7382 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7cdde544
KW
7383 if (sw) {
7384 U8 * utf8_p;
7385 if (utf8_target) {
7386 utf8_p = (U8 *) p;
e0193e47
KW
7387 } else { /* Convert to utf8 */
7388 STRLEN len = 1;
7cdde544
KW
7389 utf8_p = bytes_to_utf8(p, &len);
7390 }
f56b6394 7391
e0193e47 7392 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 7393 match = TRUE;
e0193e47 7394 }
7cdde544
KW
7395
7396 /* If we allocated a string above, free it */
7397 if (! utf8_target) Safefree(utf8_p);
7398 }
7399 }
5073ffbd
KW
7400
7401 if (UNICODE_IS_SUPER(c)
954a2af6 7402 && OP(n) == ANYOF_WARN_SUPER
5073ffbd
KW
7403 && ckWARN_d(WARN_NON_UNICODE))
7404 {
7405 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7406 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7407 }
7cdde544
KW
7408 }
7409
f0fdc1c9
KW
7410 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7411 return cBOOL(flags & ANYOF_INVERT) ^ match;
a0ed51b3 7412}
161b471a 7413
dfe13c55 7414STATIC U8 *
0ce71af7 7415S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7416{
6af86488
KW
7417 /* return the position 'off' UTF-8 characters away from 's', forward if
7418 * 'off' >= 0, backwards if negative. But don't go outside of position
7419 * 'lim', which better be < s if off < 0 */
7420
97aff369 7421 dVAR;
7918f24d
NC
7422
7423 PERL_ARGS_ASSERT_REGHOP3;
7424
a0ed51b3 7425 if (off >= 0) {
1aa99e6b 7426 while (off-- && s < lim) {
ffc61ed2 7427 /* XXX could check well-formedness here */
a0ed51b3 7428 s += UTF8SKIP(s);
ffc61ed2 7429 }
a0ed51b3
LW
7430 }
7431 else {
1de06328
YO
7432 while (off++ && s > lim) {
7433 s--;
7434 if (UTF8_IS_CONTINUED(*s)) {
7435 while (s > lim && UTF8_IS_CONTINUATION(*s))
7436 s--;
a0ed51b3 7437 }
1de06328 7438 /* XXX could check well-formedness here */
a0ed51b3
LW
7439 }
7440 }
7441 return s;
7442}
161b471a 7443
f9f4320a
YO
7444#ifdef XXX_dmq
7445/* there are a bunch of places where we use two reghop3's that should
7446 be replaced with this routine. but since thats not done yet
7447 we ifdef it out - dmq
7448*/
dfe13c55 7449STATIC U8 *
1de06328
YO
7450S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7451{
7452 dVAR;
7918f24d
NC
7453
7454 PERL_ARGS_ASSERT_REGHOP4;
7455
1de06328
YO
7456 if (off >= 0) {
7457 while (off-- && s < rlim) {
7458 /* XXX could check well-formedness here */
7459 s += UTF8SKIP(s);
7460 }
7461 }
7462 else {
7463 while (off++ && s > llim) {
7464 s--;
7465 if (UTF8_IS_CONTINUED(*s)) {
7466 while (s > llim && UTF8_IS_CONTINUATION(*s))
7467 s--;
7468 }
7469 /* XXX could check well-formedness here */
7470 }
7471 }
7472 return s;
7473}
f9f4320a 7474#endif
1de06328
YO
7475
7476STATIC U8 *
0ce71af7 7477S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7478{
97aff369 7479 dVAR;
7918f24d
NC
7480
7481 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7482
a0ed51b3 7483 if (off >= 0) {
1aa99e6b 7484 while (off-- && s < lim) {
ffc61ed2 7485 /* XXX could check well-formedness here */
a0ed51b3 7486 s += UTF8SKIP(s);
ffc61ed2 7487 }
a0ed51b3 7488 if (off >= 0)
3dab1dad 7489 return NULL;
a0ed51b3
LW
7490 }
7491 else {
1de06328
YO
7492 while (off++ && s > lim) {
7493 s--;
7494 if (UTF8_IS_CONTINUED(*s)) {
7495 while (s > lim && UTF8_IS_CONTINUATION(*s))
7496 s--;
a0ed51b3 7497 }
1de06328 7498 /* XXX could check well-formedness here */
a0ed51b3
LW
7499 }
7500 if (off <= 0)
3dab1dad 7501 return NULL;
a0ed51b3
LW
7502 }
7503 return s;
7504}
51371543 7505
51371543 7506static void
acfe0abc 7507restore_pos(pTHX_ void *arg)
51371543 7508{
97aff369 7509 dVAR;
097eb12c 7510 regexp * const rex = (regexp *)arg;
ed301438 7511 if (PL_reg_state.re_state_eval_setup_done) {
51371543 7512 if (PL_reg_oldsaved) {
4f639d21
DM
7513 rex->subbeg = PL_reg_oldsaved;
7514 rex->sublen = PL_reg_oldsavedlen;
6502e081
DM
7515 rex->suboffset = PL_reg_oldsavedoffset;
7516 rex->subcoffset = PL_reg_oldsavedcoffset;
db2c6cb3 7517#ifdef PERL_ANY_COW
4f639d21 7518 rex->saved_copy = PL_nrs;
ed252734 7519#endif
07bc277f 7520 RXp_MATCH_COPIED_on(rex);
51371543
GS
7521 }
7522 PL_reg_magic->mg_len = PL_reg_oldpos;
ed301438 7523 PL_reg_state.re_state_eval_setup_done = FALSE;
51371543
GS
7524 PL_curpm = PL_reg_oldcurpm;
7525 }
7526}
33b8afdf
JH
7527
7528STATIC void
5aaab254 7529S_to_utf8_substr(pTHX_ regexp *prog)
33b8afdf 7530{
7e0d5ad7
KW
7531 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7532 * on the converted value */
7533
a1cac82e 7534 int i = 1;
7918f24d
NC
7535
7536 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7537
a1cac82e
NC
7538 do {
7539 if (prog->substrs->data[i].substr
7540 && !prog->substrs->data[i].utf8_substr) {
7541 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7542 prog->substrs->data[i].utf8_substr = sv;
7543 sv_utf8_upgrade(sv);
610460f9 7544 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7545 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7546 /* Trim the trailing \n that fbm_compile added last
7547 time. */
7548 SvCUR_set(sv, SvCUR(sv) - 1);
7549 /* Whilst this makes the SV technically "invalid" (as its
7550 buffer is no longer followed by "\0") when fbm_compile()
7551 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7552 fbm_compile(sv, FBMcf_TAIL);
7553 } else
7554 fbm_compile(sv, 0);
610460f9 7555 }
a1cac82e
NC
7556 if (prog->substrs->data[i].substr == prog->check_substr)
7557 prog->check_utf8 = sv;
7558 }
7559 } while (i--);
33b8afdf
JH
7560}
7561
7e0d5ad7 7562STATIC bool
5aaab254 7563S_to_byte_substr(pTHX_ regexp *prog)
33b8afdf 7564{
7e0d5ad7
KW
7565 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7566 * on the converted value; returns FALSE if can't be converted. */
7567
97aff369 7568 dVAR;
a1cac82e 7569 int i = 1;
7918f24d
NC
7570
7571 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7572
a1cac82e
NC
7573 do {
7574 if (prog->substrs->data[i].utf8_substr
7575 && !prog->substrs->data[i].substr) {
7576 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
7577 if (! sv_utf8_downgrade(sv, TRUE)) {
7578 return FALSE;
7579 }
5400f398
KW
7580 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7581 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7582 /* Trim the trailing \n that fbm_compile added last
7583 time. */
7584 SvCUR_set(sv, SvCUR(sv) - 1);
7585 fbm_compile(sv, FBMcf_TAIL);
7586 } else
7587 fbm_compile(sv, 0);
7588 }
a1cac82e
NC
7589 prog->substrs->data[i].substr = sv;
7590 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7591 prog->check_substr = sv;
33b8afdf 7592 }
a1cac82e 7593 } while (i--);
7e0d5ad7
KW
7594
7595 return TRUE;
33b8afdf 7596}
66610fdd
RGS
7597
7598/*
7599 * Local variables:
7600 * c-indentation-style: bsd
7601 * c-basic-offset: 4
14d04a33 7602 * indent-tabs-mode: nil
66610fdd
RGS
7603 * End:
7604 *
14d04a33 7605 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7606 */