This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Added descriptions to tests lacking them
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
c74f6de9
KW
96#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
e0193e47 102/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
635cd5d4 105#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 106 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 107
c277df42
IZ
108/*
109 * Forwards.
110 */
111
f2ed9b32 112#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 113#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 114
3dab1dad
YO
115#define HOPc(pos,off) \
116 (char *)(PL_reg_match_utf8 \
52657f30 117 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
118 : (U8*)(pos + off))
119#define HOPBACKc(pos, off) \
07be1b83
YO
120 (char*)(PL_reg_match_utf8\
121 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
122 : (pos - off >= PL_bostr) \
8e11feef 123 ? (U8*)pos - off \
3dab1dad 124 : NULL)
efb30f32 125
e7409c1b 126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 127#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 128
7016d6eb
DM
129
130#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
131#define NEXTCHR_IS_EOS (nextchr < 0)
132
133#define SET_nextchr \
134 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
135
136#define SET_locinput(p) \
137 locinput = (p); \
138 SET_nextchr
139
140
c7304fe2
KW
141#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
142 if (!swash_ptr) { \
143 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2
KW
144 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
145 1, 0, NULL, &flags); \
146 assert(swash_ptr); \
147 } \
148 } STMT_END
149
150/* If in debug mode, we test that a known character properly matches */
151#ifdef DEBUGGING
152# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
153 property_name, \
154 utf8_char_in_property) \
155 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
156 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
157#else
158# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
159 property_name, \
160 utf8_char_in_property) \
161 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
162#endif
d1eb3177 163
c7304fe2
KW
164#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
165 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
166 swash_property_names[_CC_WORDCHAR], \
167 GREEK_SMALL_LETTER_IOTA_UTF8)
168
169#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
170 STMT_START { \
171 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
172 "_X_regular_begin", \
173 GREEK_SMALL_LETTER_IOTA_UTF8); \
174 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
175 "_X_extend", \
176 COMBINING_GRAVE_ACCENT_UTF8); \
177 } STMT_END
d1eb3177 178
c7304fe2 179#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
180/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
181
5f80c4cf 182/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
183/* it would be nice to rework regcomp.sym to generate this stuff. sigh
184 *
185 * NOTE that *nothing* that affects backtracking should be in here, specifically
186 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
187 * node that is in between two EXACT like nodes when ascertaining what the required
188 * "follow" character is. This should probably be moved to regex compile time
189 * although it may be done at run time beause of the REF possibility - more
190 * investigation required. -- demerphq
191*/
3e901dc0
YO
192#define JUMPABLE(rn) ( \
193 OP(rn) == OPEN || \
194 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
195 OP(rn) == EVAL || \
cca55fe3
JP
196 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
197 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 198 OP(rn) == KEEPS || \
3dab1dad 199 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 200)
ee9b8eae 201#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 202
ee9b8eae
YO
203#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
204
205#if 0
206/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
207 we don't need this definition. */
208#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 209#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
210#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
211
212#else
213/* ... so we use this as its faster. */
214#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 215#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
216#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
217#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
218
219#endif
e2d8ce26 220
a84d97b6
HS
221/*
222 Search for mandatory following text node; for lookahead, the text must
223 follow but for lookbehind (rn->flags != 0) we skip to the next step.
224*/
cca55fe3 225#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
226 while (JUMPABLE(rn)) { \
227 const OPCODE type = OP(rn); \
228 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 229 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 230 else if (type == PLUS) \
cca55fe3 231 rn = NEXTOPER(rn); \
3dab1dad 232 else if (type == IFMATCH) \
a84d97b6 233 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 234 else rn += NEXT_OFF(rn); \
3dab1dad 235 } \
5f80c4cf 236} STMT_END
74750237 237
22913b96
KW
238/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
239 * These are for the pre-composed Hangul syllables, which are all in a
240 * contiguous block and arranged there in such a way so as to facilitate
241 * alorithmic determination of their characteristics. As such, they don't need
242 * a swash, but can be determined by simple arithmetic. Almost all are
243 * GCB=LVT, but every 28th one is a GCB=LV */
244#define SBASE 0xAC00 /* Start of block */
245#define SCount 11172 /* Length of block */
246#define TCount 28
c476f425 247
acfe0abc 248static void restore_pos(pTHX_ void *arg);
51371543 249
87c0511b 250#define REGCP_PAREN_ELEMS 3
f067efbf 251#define REGCP_OTHER_ELEMS 3
e0fa7e2b 252#define REGCP_FRAME_ELEMS 1
620d5b66
NC
253/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
254 * are needed for the regexp context stack bookkeeping. */
255
76e3520e 256STATIC CHECKPOINT
92da3157 257S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 258{
97aff369 259 dVAR;
a3b680e6 260 const int retval = PL_savestack_ix;
92da3157
DM
261 const int paren_elems_to_push =
262 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
263 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
264 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 265 I32 p;
40a82448 266 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 267
b93070ed
DM
268 PERL_ARGS_ASSERT_REGCPPUSH;
269
e49a9654 270 if (paren_elems_to_push < 0)
5637ef5b
NC
271 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
272 paren_elems_to_push);
e49a9654 273
e0fa7e2b
NC
274 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
275 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 276 " out of range (%lu-%ld)",
92da3157
DM
277 total_elems,
278 (unsigned long)maxopenparen,
279 (long)parenfloor);
e0fa7e2b 280
620d5b66 281 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 282
495f47a5 283 DEBUG_BUFFERS_r(
92da3157 284 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
285 PerlIO_printf(Perl_debug_log,
286 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
287 PTR2UV(rex),
288 PTR2UV(rex->offs)
289 );
290 );
92da3157 291 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 292/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
293 SSPUSHINT(rex->offs[p].end);
294 SSPUSHINT(rex->offs[p].start);
1ca2007e 295 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 296 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
297 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
298 (UV)p,
299 (IV)rex->offs[p].start,
300 (IV)rex->offs[p].start_tmp,
301 (IV)rex->offs[p].end
40a82448 302 ));
a0d0e21e 303 }
b1ce53c5 304/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 305 SSPUSHINT(maxopenparen);
b93070ed
DM
306 SSPUSHINT(rex->lastparen);
307 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 308 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 309
a0d0e21e
LW
310 return retval;
311}
312
c277df42 313/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
314#define REGCP_SET(cp) \
315 DEBUG_STATE_r( \
ab3bbdeb 316 PerlIO_printf(Perl_debug_log, \
e4f74956 317 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
318 (IV)PL_savestack_ix)); \
319 cp = PL_savestack_ix
c3464db5 320
ab3bbdeb 321#define REGCP_UNWIND(cp) \
e4f74956 322 DEBUG_STATE_r( \
ab3bbdeb 323 if (cp != PL_savestack_ix) \
e4f74956
YO
324 PerlIO_printf(Perl_debug_log, \
325 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
326 (IV)(cp), (IV)PL_savestack_ix)); \
327 regcpblow(cp)
c277df42 328
a8d1f4b4
DM
329#define UNWIND_PAREN(lp, lcp) \
330 for (n = rex->lastparen; n > lp; n--) \
331 rex->offs[n].end = -1; \
332 rex->lastparen = n; \
333 rex->lastcloseparen = lcp;
334
335
f067efbf 336STATIC void
92da3157 337S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 338{
97aff369 339 dVAR;
e0fa7e2b 340 UV i;
87c0511b 341 U32 paren;
a3621e74
YO
342 GET_RE_DEBUG_FLAGS_DECL;
343
7918f24d
NC
344 PERL_ARGS_ASSERT_REGCPPOP;
345
b1ce53c5 346 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 347 i = SSPOPUV;
e0fa7e2b
NC
348 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
349 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
350 rex->lastcloseparen = SSPOPINT;
351 rex->lastparen = SSPOPINT;
92da3157 352 *maxopenparen_p = SSPOPINT;
b1ce53c5 353
620d5b66 354 i -= REGCP_OTHER_ELEMS;
b1ce53c5 355 /* Now restore the parentheses context. */
495f47a5
DM
356 DEBUG_BUFFERS_r(
357 if (i || rex->lastparen + 1 <= rex->nparens)
358 PerlIO_printf(Perl_debug_log,
359 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
360 PTR2UV(rex),
361 PTR2UV(rex->offs)
362 );
363 );
92da3157 364 paren = *maxopenparen_p;
620d5b66 365 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 366 I32 tmps;
1ca2007e 367 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 368 rex->offs[paren].start = SSPOPINT;
cf93c79d 369 tmps = SSPOPINT;
b93070ed
DM
370 if (paren <= rex->lastparen)
371 rex->offs[paren].end = tmps;
495f47a5
DM
372 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
373 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
374 (UV)paren,
375 (IV)rex->offs[paren].start,
376 (IV)rex->offs[paren].start_tmp,
377 (IV)rex->offs[paren].end,
378 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 379 );
87c0511b 380 paren--;
a0d0e21e 381 }
daf18116 382#if 1
dafc8851
JH
383 /* It would seem that the similar code in regtry()
384 * already takes care of this, and in fact it is in
385 * a better location to since this code can #if 0-ed out
386 * but the code in regtry() is needed or otherwise tests
387 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
388 * (as of patchlevel 7877) will fail. Then again,
389 * this code seems to be necessary or otherwise
225593e1
DM
390 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
391 * --jhi updated by dapm */
b93070ed 392 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 393 if (i > *maxopenparen_p)
b93070ed
DM
394 rex->offs[i].start = -1;
395 rex->offs[i].end = -1;
495f47a5
DM
396 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
397 " \\%"UVuf": %s ..-1 undeffing\n",
398 (UV)i,
92da3157 399 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 400 ));
a0d0e21e 401 }
dafc8851 402#endif
a0d0e21e
LW
403}
404
74088413
DM
405/* restore the parens and associated vars at savestack position ix,
406 * but without popping the stack */
407
408STATIC void
92da3157 409S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
410{
411 I32 tmpix = PL_savestack_ix;
412 PL_savestack_ix = ix;
92da3157 413 regcppop(rex, maxopenparen_p);
74088413
DM
414 PL_savestack_ix = tmpix;
415}
416
02db2b7b 417#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 418
31c7f561
KW
419STATIC bool
420S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
421{
422 /* Returns a boolean as to whether or not 'character' is a member of the
423 * Posix character class given by 'classnum' that should be equivalent to a
424 * value in the typedef '_char_class_number'.
425 *
426 * Ideally this could be replaced by a just an array of function pointers
427 * to the C library functions that implement the macros this calls.
428 * However, to compile, the precise function signatures are required, and
429 * these may vary from platform to to platform. To avoid having to figure
430 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
431 * which adds an extra layer of function call overhead (unless the C
432 * optimizer strips it away). But we don't particularly care about
433 * performance with locales anyway. */
31c7f561
KW
434
435 switch ((_char_class_number) classnum) {
15861f94 436 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 437 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
438 case _CC_ENUM_ASCII: return isASCII_LC(character);
439 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
440 case _CC_ENUM_CASED: return isLOWER_LC(character)
441 || isUPPER_LC(character);
e8d596e0 442 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
443 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
444 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
445 case _CC_ENUM_LOWER: return isLOWER_LC(character);
446 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 447 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 448 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 449 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
450 case _CC_ENUM_UPPER: return isUPPER_LC(character);
451 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 452 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
453 default: /* VERTSPACE should never occur in locales */
454 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
455 }
456
457 assert(0); /* NOTREACHED */
458 return FALSE;
459}
460
3018b823
KW
461STATIC bool
462S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
463{
464 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
465 * 'character' is a member of the Posix character class given by 'classnum'
466 * that should be equivalent to a value in the typedef
467 * '_char_class_number'.
468 *
469 * This just calls isFOO_lc on the code point for the character if it is in
470 * the range 0-255. Outside that range, all characters avoid Unicode
471 * rules, ignoring any locale. So use the Unicode function if this class
472 * requires a swash, and use the Unicode macro otherwise. */
473
474 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
475
476 if (UTF8_IS_INVARIANT(*character)) {
477 return isFOO_lc(classnum, *character);
478 }
479 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
480 return isFOO_lc(classnum,
481 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
482 }
483
484 if (classnum < _FIRST_NON_SWASH_CC) {
485
486 /* Initialize the swash unless done already */
487 if (! PL_utf8_swash_ptrs[classnum]) {
488 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
489 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
490 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
491 }
492
92a2046b
KW
493 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
494 character,
495 TRUE /* is UTF */ ));
3018b823
KW
496 }
497
498 switch ((_char_class_number) classnum) {
499 case _CC_ENUM_SPACE:
500 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
501
502 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
503 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
504 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
505 default: return 0; /* Things like CNTRL are always
506 below 256 */
507 }
508
509 assert(0); /* NOTREACHED */
510 return FALSE;
511}
512
a687059c 513/*
e50aee73 514 * pregexec and friends
a687059c
LW
515 */
516
76234dfb 517#ifndef PERL_IN_XSUB_RE
a687059c 518/*
c277df42 519 - pregexec - match a regexp against a string
a687059c 520 */
c277df42 521I32
5aaab254 522Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
c3464db5 523 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
524/* stringarg: the point in the string at which to begin matching */
525/* strend: pointer to null at end of string */
526/* strbeg: real beginning of string */
527/* minend: end of match must be >= minend bytes after stringarg. */
528/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
529 * itself is accessed via the pointers above */
530/* nosave: For optimizations. */
c277df42 531{
7918f24d
NC
532 PERL_ARGS_ASSERT_PREGEXEC;
533
c277df42 534 return
9041c2e3 535 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
536 nosave ? 0 : REXEC_COPY_STR);
537}
76234dfb 538#endif
22e551b9 539
9041c2e3 540/*
cad2e5aa
JH
541 * Need to implement the following flags for reg_anch:
542 *
543 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
544 * USE_INTUIT_ML
545 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
546 * INTUIT_AUTORITATIVE_ML
547 * INTUIT_ONCE_NOML - Intuit can match in one location only.
548 * INTUIT_ONCE_ML
549 *
550 * Another flag for this function: SECOND_TIME (so that float substrs
551 * with giant delta may be not rechecked).
552 */
553
554/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
555
3f7c398e 556/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
557 Otherwise, only SvCUR(sv) is used to get strbeg. */
558
559/* XXXX We assume that strpos is strbeg unless sv. */
560
6eb5f6b9
JH
561/* XXXX Some places assume that there is a fixed substring.
562 An update may be needed if optimizer marks as "INTUITable"
563 RExen without fixed substrings. Similarly, it is assumed that
564 lengths of all the strings are no more than minlen, thus they
565 cannot come from lookahead.
40d049e4
YO
566 (Or minlen should take into account lookahead.)
567 NOTE: Some of this comment is not correct. minlen does now take account
568 of lookahead/behind. Further research is required. -- demerphq
569
570*/
6eb5f6b9 571
2c2d71f5
JH
572/* A failure to find a constant substring means that there is no need to make
573 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 574 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
575 regtry() should be needed.
576
577 REx compiler's optimizer found 4 possible hints:
578 a) Anchored substring;
579 b) Fixed substring;
580 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 581 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 582 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
583 string which does not contradict any of them.
584 */
2c2d71f5 585
6eb5f6b9
JH
586/* Most of decisions we do here should have been done at compile time.
587 The nodes of the REx which we used for the search should have been
588 deleted from the finite automaton. */
589
cad2e5aa 590char *
288b8c02 591Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 592 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 593{
97aff369 594 dVAR;
8d919b0a 595 struct regexp *const prog = ReANY(rx);
eb578fdb 596 I32 start_shift = 0;
cad2e5aa 597 /* Should be nonnegative! */
eb578fdb
KW
598 I32 end_shift = 0;
599 char *s;
600 SV *check;
a1933d95 601 char *strbeg;
cad2e5aa 602 char *t;
f2ed9b32 603 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 604 I32 ml_anch;
eb578fdb 605 char *other_last = NULL; /* other substr checked before this */
bd61b366 606 char *check_at = NULL; /* check substr found at this pos */
d8080198 607 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 608 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 609 RXi_GET_DECL(prog,progi);
984e6dd1 610 bool is_utf8_pat;
30944b6d 611#ifdef DEBUGGING
890ce7af 612 const char * const i_strpos = strpos;
30944b6d 613#endif
a3621e74
YO
614 GET_RE_DEBUG_FLAGS_DECL;
615
7918f24d 616 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
617 PERL_UNUSED_ARG(flags);
618 PERL_UNUSED_ARG(data);
7918f24d 619
f2ed9b32 620 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 621
984e6dd1 622 is_utf8_pat = cBOOL(RX_UTF8(rx));
4fab19ce 623
ab3bbdeb 624 DEBUG_EXECUTE_r(
f2ed9b32 625 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
626 sv ? "Guessing start of match in sv for"
627 : "Guessing start of match in string for");
2a782b5b 628 );
cad2e5aa 629
c344f387
JH
630 /* CHR_DIST() would be more correct here but it makes things slow. */
631 if (prog->minlen > strend - strpos) {
a3621e74 632 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 633 "String too short... [re_intuit_start]\n"));
cad2e5aa 634 goto fail;
2c2d71f5 635 }
d8da0584 636
7016d6eb
DM
637 /* XXX we need to pass strbeg as a separate arg: the following is
638 * guesswork and can be wrong... */
639 if (sv && SvPOK(sv)) {
640 char * p = SvPVX(sv);
641 STRLEN cur = SvCUR(sv);
642 if (p <= strpos && strpos < p + cur) {
643 strbeg = p;
644 assert(p <= strend && strend <= p + cur);
645 }
646 else
647 strbeg = strend - cur;
648 }
649 else
650 strbeg = strpos;
651
1aa99e6b 652 PL_regeol = strend;
f2ed9b32 653 if (utf8_target) {
33b8afdf
JH
654 if (!prog->check_utf8 && prog->check_substr)
655 to_utf8_substr(prog);
656 check = prog->check_utf8;
657 } else {
7e0d5ad7
KW
658 if (!prog->check_substr && prog->check_utf8) {
659 if (! to_byte_substr(prog)) {
6b54ddc5 660 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
661 }
662 }
33b8afdf
JH
663 check = prog->check_substr;
664 }
bbe252da
YO
665 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
666 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
667 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 668 && !multiline ) ); /* Check after \n? */
cad2e5aa 669
7e25d62c 670 if (!ml_anch) {
bbe252da
YO
671 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
672 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 673 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
674 && sv && !SvROK(sv)
675 && (strpos != strbeg)) {
a3621e74 676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
677 goto fail;
678 }
d46b78c6
KW
679 if (prog->check_offset_min == prog->check_offset_max
680 && !(prog->extflags & RXf_CANY_SEEN)
681 && ! multiline) /* /m can cause \n's to match that aren't
682 accounted for in the string max length.
683 See [perl #115242] */
684 {
2c2d71f5 685 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
686 I32 slen;
687
1aa99e6b 688 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 689
653099ff
GS
690 if (SvTAIL(check)) {
691 slen = SvCUR(check); /* >= 1 */
cad2e5aa 692
9041c2e3 693 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 694 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 695 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 696 goto fail_finish;
cad2e5aa
JH
697 }
698 /* Now should match s[0..slen-2] */
699 slen--;
3f7c398e 700 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 701 || (slen > 1
3f7c398e 702 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 703 report_neq:
a3621e74 704 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
705 goto fail_finish;
706 }
cad2e5aa 707 }
3f7c398e 708 else if (*SvPVX_const(check) != *s
653099ff 709 || ((slen = SvCUR(check)) > 1
3f7c398e 710 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 711 goto report_neq;
c315bfe8 712 check_at = s;
2c2d71f5 713 goto success_at_start;
7e25d62c 714 }
cad2e5aa 715 }
2c2d71f5 716 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 717 s = strpos;
2c2d71f5 718 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
719 end_shift = prog->check_end_shift;
720
2c2d71f5 721 if (!ml_anch) {
a3b680e6 722 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 723 - (SvTAIL(check) != 0);
a3b680e6 724 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
725
726 if (end_shift < eshift)
727 end_shift = eshift;
728 }
cad2e5aa 729 }
2c2d71f5 730 else { /* Can match at random position */
cad2e5aa
JH
731 ml_anch = 0;
732 s = strpos;
1de06328
YO
733 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
734 end_shift = prog->check_end_shift;
735
736 /* end shift should be non negative here */
cad2e5aa
JH
737 }
738
bcdf7404 739#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 740 if (end_shift < 0)
1de06328 741 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 742 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
743#endif
744
2c2d71f5
JH
745 restart:
746 /* Find a possible match in the region s..strend by looking for
747 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
748
749 {
750 I32 srch_start_shift = start_shift;
751 I32 srch_end_shift = end_shift;
c33e64f0
FC
752 U8* start_point;
753 U8* end_point;
1de06328
YO
754 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
755 srch_end_shift -= ((strbeg - s) - srch_start_shift);
756 srch_start_shift = strbeg - s;
757 }
6bda09f9 758 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
759 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
760 (IV)prog->check_offset_min,
761 (IV)srch_start_shift,
762 (IV)srch_end_shift,
763 (IV)prog->check_end_shift);
764 });
765
bbe252da 766 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
767 start_point= (U8*)(s + srch_start_shift);
768 end_point= (U8*)(strend - srch_end_shift);
769 } else {
770 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
771 end_point= HOP3(strend, -srch_end_shift, strbeg);
772 }
6bda09f9 773 DEBUG_OPTIMISE_MORE_r({
56570a2c 774 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 775 (int)(end_point - start_point),
fc8cd66c 776 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
777 start_point);
778 });
779
780 s = fbm_instr( start_point, end_point,
7fba1cd6 781 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 782 }
cad2e5aa
JH
783 /* Update the count-of-usability, remove useless subpatterns,
784 unshift s. */
2c2d71f5 785
ab3bbdeb 786 DEBUG_EXECUTE_r({
f2ed9b32 787 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
788 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
789 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 790 (s ? "Found" : "Did not find"),
f2ed9b32 791 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
792 ? "anchored" : "floating"),
793 quoted,
794 RE_SV_TAIL(check),
795 (s ? " at offset " : "...\n") );
796 });
2c2d71f5
JH
797
798 if (!s)
799 goto fail_finish;
2c2d71f5 800 /* Finish the diagnostic message */
a3621e74 801 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 802
1de06328
YO
803 /* XXX dmq: first branch is for positive lookbehind...
804 Our check string is offset from the beginning of the pattern.
805 So we need to do any stclass tests offset forward from that
806 point. I think. :-(
807 */
808
809
810
811 check_at=s;
812
813
2c2d71f5
JH
814 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
815 Start with the other substr.
816 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 817 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
818 *always* match. Probably should be marked during compile...
819 Probably it is right to do no SCREAM here...
820 */
821
f2ed9b32 822 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
823 : (prog->float_substr && prog->anchored_substr))
824 {
30944b6d 825 /* Take into account the "other" substring. */
2c2d71f5
JH
826 /* XXXX May be hopelessly wrong for UTF... */
827 if (!other_last)
6eb5f6b9 828 other_last = strpos;
f2ed9b32 829 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
830 do_other_anchored:
831 {
890ce7af
AL
832 char * const last = HOP3c(s, -start_shift, strbeg);
833 char *last1, *last2;
be8e71aa 834 char * const saved_s = s;
33b8afdf 835 SV* must;
2c2d71f5 836
2c2d71f5
JH
837 t = s - prog->check_offset_max;
838 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 839 && (!utf8_target
0ce71af7 840 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 841 && t > strpos)))
6f207bd3 842 NOOP;
2c2d71f5
JH
843 else
844 t = strpos;
1aa99e6b 845 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
846 if (t < other_last) /* These positions already checked */
847 t = other_last;
1aa99e6b 848 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
849 if (last < last1)
850 last1 = last;
1de06328
YO
851 /* XXXX It is not documented what units *_offsets are in.
852 We assume bytes, but this is clearly wrong.
853 Meaning this code needs to be carefully reviewed for errors.
854 dmq.
855 */
856
2c2d71f5 857 /* On end-of-str: see comment below. */
f2ed9b32 858 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
859 if (must == &PL_sv_undef) {
860 s = (char*)NULL;
1de06328 861 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
862 }
863 else
864 s = fbm_instr(
865 (unsigned char*)t,
866 HOP3(HOP3(last1, prog->anchored_offset, strend)
867 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
868 must,
7fba1cd6 869 multiline ? FBMrf_MULTILINE : 0
33b8afdf 870 );
ab3bbdeb 871 DEBUG_EXECUTE_r({
f2ed9b32 872 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
873 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
874 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 875 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
876 quoted, RE_SV_TAIL(must));
877 });
878
879
2c2d71f5
JH
880 if (!s) {
881 if (last1 >= last2) {
a3621e74 882 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
883 ", giving up...\n"));
884 goto fail_finish;
885 }
a3621e74 886 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 887 ", trying floating at offset %ld...\n",
be8e71aa 888 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
889 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
890 s = HOP3c(last, 1, strend);
2c2d71f5
JH
891 goto restart;
892 }
893 else {
a3621e74 894 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 895 (long)(s - i_strpos)));
1aa99e6b
IH
896 t = HOP3c(s, -prog->anchored_offset, strbeg);
897 other_last = HOP3c(s, 1, strend);
be8e71aa 898 s = saved_s;
2c2d71f5
JH
899 if (t == strpos)
900 goto try_at_start;
2c2d71f5
JH
901 goto try_at_offset;
902 }
30944b6d 903 }
2c2d71f5
JH
904 }
905 else { /* Take into account the floating substring. */
33b8afdf 906 char *last, *last1;
be8e71aa 907 char * const saved_s = s;
33b8afdf
JH
908 SV* must;
909
910 t = HOP3c(s, -start_shift, strbeg);
911 last1 = last =
912 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
913 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
914 last = HOP3c(t, prog->float_max_offset, strend);
915 s = HOP3c(t, prog->float_min_offset, strend);
916 if (s < other_last)
917 s = other_last;
2c2d71f5 918 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 919 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
920 /* fbm_instr() takes into account exact value of end-of-str
921 if the check is SvTAIL(ed). Since false positives are OK,
922 and end-of-str is not later than strend we are OK. */
923 if (must == &PL_sv_undef) {
924 s = (char*)NULL;
1de06328 925 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
926 }
927 else
2c2d71f5 928 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
929 (unsigned char*)last + SvCUR(must)
930 - (SvTAIL(must)!=0),
7fba1cd6 931 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 932 DEBUG_EXECUTE_r({
f2ed9b32 933 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
934 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
935 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 936 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
937 quoted, RE_SV_TAIL(must));
938 });
33b8afdf
JH
939 if (!s) {
940 if (last1 == last) {
a3621e74 941 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
942 ", giving up...\n"));
943 goto fail_finish;
2c2d71f5 944 }
a3621e74 945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 946 ", trying anchored starting at offset %ld...\n",
be8e71aa 947 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
948 other_last = last;
949 s = HOP3c(t, 1, strend);
950 goto restart;
951 }
952 else {
a3621e74 953 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
954 (long)(s - i_strpos)));
955 other_last = s; /* Fix this later. --Hugo */
be8e71aa 956 s = saved_s;
33b8afdf
JH
957 if (t == strpos)
958 goto try_at_start;
959 goto try_at_offset;
960 }
2c2d71f5 961 }
cad2e5aa 962 }
2c2d71f5 963
1de06328 964
9ef43ace 965 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 966
6bda09f9 967 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
968 PerlIO_printf(Perl_debug_log,
969 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
970 (IV)prog->check_offset_min,
971 (IV)prog->check_offset_max,
972 (IV)(s-strpos),
973 (IV)(t-strpos),
974 (IV)(t-s),
975 (IV)(strend-strpos)
976 )
977 );
978
2c2d71f5 979 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 980 && (!utf8_target
9ef43ace 981 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
982 && t > strpos)))
983 {
2c2d71f5
JH
984 /* Fixed substring is found far enough so that the match
985 cannot start at strpos. */
986 try_at_offset:
cad2e5aa 987 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
988 /* Eventually fbm_*() should handle this, but often
989 anchored_offset is not 0, so this check will not be wasted. */
990 /* XXXX In the code below we prefer to look for "^" even in
991 presence of anchored substrings. And we search even
992 beyond the found float position. These pessimizations
993 are historical artefacts only. */
994 find_anchor:
2c2d71f5 995 while (t < strend - prog->minlen) {
cad2e5aa 996 if (*t == '\n') {
4ee3650e 997 if (t < check_at - prog->check_offset_min) {
f2ed9b32 998 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
999 /* Since we moved from the found position,
1000 we definitely contradict the found anchored
30944b6d
IZ
1001 substr. Due to the above check we do not
1002 contradict "check" substr.
1003 Thus we can arrive here only if check substr
1004 is float. Redo checking for "other"=="fixed".
1005 */
9041c2e3 1006 strpos = t + 1;
a3621e74 1007 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 1008 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
1009 goto do_other_anchored;
1010 }
4ee3650e
GS
1011 /* We don't contradict the found floating substring. */
1012 /* XXXX Why not check for STCLASS? */
cad2e5aa 1013 s = t + 1;
a3621e74 1014 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1015 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1016 goto set_useful;
1017 }
4ee3650e
GS
1018 /* Position contradicts check-string */
1019 /* XXXX probably better to look for check-string
1020 than for "\n", so one should lower the limit for t? */
a3621e74 1021 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1022 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1023 other_last = strpos = s = t + 1;
cad2e5aa
JH
1024 goto restart;
1025 }
1026 t++;
1027 }
a3621e74 1028 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1029 PL_colors[0], PL_colors[1]));
2c2d71f5 1030 goto fail_finish;
cad2e5aa 1031 }
f5952150 1032 else {
a3621e74 1033 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1034 PL_colors[0], PL_colors[1]));
f5952150 1035 }
cad2e5aa
JH
1036 s = t;
1037 set_useful:
f2ed9b32 1038 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1039 }
1040 else {
f5952150 1041 /* The found string does not prohibit matching at strpos,
2c2d71f5 1042 - no optimization of calling REx engine can be performed,
f5952150
GS
1043 unless it was an MBOL and we are not after MBOL,
1044 or a future STCLASS check will fail this. */
2c2d71f5
JH
1045 try_at_start:
1046 /* Even in this situation we may use MBOL flag if strpos is offset
1047 wrt the start of the string. */
05b4157f 1048 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1049 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1050 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1051 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1052 {
cad2e5aa
JH
1053 t = strpos;
1054 goto find_anchor;
1055 }
a3621e74 1056 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1057 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1058 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1059 );
2c2d71f5 1060 success_at_start:
bbe252da 1061 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1062 && (utf8_target ? (
33b8afdf
JH
1063 prog->check_utf8 /* Could be deleted already */
1064 && --BmUSEFUL(prog->check_utf8) < 0
1065 && (prog->check_utf8 == prog->float_utf8)
1066 ) : (
1067 prog->check_substr /* Could be deleted already */
1068 && --BmUSEFUL(prog->check_substr) < 0
1069 && (prog->check_substr == prog->float_substr)
1070 )))
66e933ab 1071 {
cad2e5aa 1072 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1073 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1074 /* XXX Does the destruction order has to change with utf8_target? */
1075 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1076 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1077 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1078 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1079 check = NULL; /* abort */
cad2e5aa 1080 s = strpos;
486ec47a 1081 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1082 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1083 if (prog->intflags & PREGf_IMPLICIT)
1084 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1085 /* XXXX This is a remnant of the old implementation. It
1086 looks wasteful, since now INTUIT can use many
6eb5f6b9 1087 other heuristics. */
bbe252da 1088 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1089 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1090 }
1091 else
1092 s = strpos;
1093 }
1094
6eb5f6b9
JH
1095 /* Last resort... */
1096 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1097 /* trie stclasses are too expensive to use here, we are better off to
1098 leave it to regmatch itself */
f8fc2ecf 1099 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1100 /* minlen == 0 is possible if regstclass is \b or \B,
1101 and the fixed substr is ''$.
1102 Since minlen is already taken into account, s+1 is before strend;
1103 accidentally, minlen >= 1 guaranties no false positives at s + 1
1104 even for \b or \B. But (minlen? 1 : 0) below assumes that
1105 regstclass does not come from lookahead... */
1106 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1107 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1108 const U8* const str = (U8*)STRING(progi->regstclass);
1109 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1110 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1111 : 1);
1de06328
YO
1112 char * endpos;
1113 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1114 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1115 else if (prog->float_substr || prog->float_utf8)
1116 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1117 else
1118 endpos= strend;
1119
d8080198
YO
1120 if (checked_upto < s)
1121 checked_upto = s;
1122 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1123 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1124
6eb5f6b9 1125 t = s;
984e6dd1
DM
1126 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1127 NULL, is_utf8_pat);
d8080198
YO
1128 if (s) {
1129 checked_upto = s;
1130 } else {
6eb5f6b9 1131#ifdef DEBUGGING
cbbf8932 1132 const char *what = NULL;
6eb5f6b9
JH
1133#endif
1134 if (endpos == strend) {
a3621e74 1135 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1136 "Could not match STCLASS...\n") );
1137 goto fail;
1138 }
a3621e74 1139 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1140 "This position contradicts STCLASS...\n") );
bbe252da 1141 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1142 goto fail;
d8080198
YO
1143 checked_upto = HOPBACKc(endpos, start_shift);
1144 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1145 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1146 /* Contradict one of substrings */
33b8afdf 1147 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1148 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1149 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1150 hop_and_restart:
1aa99e6b 1151 s = HOP3c(t, 1, strend);
66e933ab
GS
1152 if (s + start_shift + end_shift > strend) {
1153 /* XXXX Should be taken into account earlier? */
a3621e74 1154 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1155 "Could not match STCLASS...\n") );
1156 goto fail;
1157 }
5e39e1e5
HS
1158 if (!check)
1159 goto giveup;
a3621e74 1160 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1161 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1162 what, (long)(s + start_shift - i_strpos)) );
1163 goto restart;
1164 }
66e933ab 1165 /* Have both, check_string is floating */
6eb5f6b9
JH
1166 if (t + start_shift >= check_at) /* Contradicts floating=check */
1167 goto retry_floating_check;
1168 /* Recheck anchored substring, but not floating... */
9041c2e3 1169 s = check_at;
5e39e1e5
HS
1170 if (!check)
1171 goto giveup;
a3621e74 1172 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1173 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1174 (long)(other_last - i_strpos)) );
1175 goto do_other_anchored;
1176 }
60e71179
GS
1177 /* Another way we could have checked stclass at the
1178 current position only: */
1179 if (ml_anch) {
1180 s = t = t + 1;
5e39e1e5
HS
1181 if (!check)
1182 goto giveup;
a3621e74 1183 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1184 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1185 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1186 goto try_at_offset;
66e933ab 1187 }
f2ed9b32 1188 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1189 goto fail;
486ec47a 1190 /* Check is floating substring. */
6eb5f6b9
JH
1191 retry_floating_check:
1192 t = check_at - start_shift;
a3621e74 1193 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1194 goto hop_and_restart;
1195 }
b7953727 1196 if (t != s) {
a3621e74 1197 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1198 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1199 (long)(t - i_strpos), (long)(s - i_strpos))
1200 );
1201 }
1202 else {
a3621e74 1203 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1204 "Does not contradict STCLASS...\n");
1205 );
1206 }
6eb5f6b9 1207 }
5e39e1e5 1208 giveup:
a3621e74 1209 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1210 PL_colors[4], (check ? "Guessed" : "Giving up"),
1211 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1212 return s;
2c2d71f5
JH
1213
1214 fail_finish: /* Substring not found */
33b8afdf 1215 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1216 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1217 fail:
a3621e74 1218 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1219 PL_colors[4], PL_colors[5]));
bd61b366 1220 return NULL;
cad2e5aa 1221}
9661b544 1222
a0a388a1
YO
1223#define DECL_TRIE_TYPE(scan) \
1224 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1225 trie_type = ((scan->flags == EXACT) \
1226 ? (utf8_target ? trie_utf8 : trie_plain) \
1227 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1228
fd3249ee
YO
1229#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1230STMT_START { \
fab2782b
YO
1231 STRLEN skiplen; \
1232 switch (trie_type) { \
1233 case trie_utf8_fold: \
1234 if ( foldlen>0 ) { \
1235 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1236 foldlen -= len; \
1237 uscan += len; \
1238 len=0; \
1239 } else { \
1240 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1241 len = UTF8SKIP(uc); \
1242 skiplen = UNISKIP( uvc ); \
1243 foldlen -= skiplen; \
1244 uscan = foldbuf + skiplen; \
1245 } \
1246 break; \
1247 case trie_latin_utf8_fold: \
1248 if ( foldlen>0 ) { \
1249 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1250 foldlen -= len; \
1251 uscan += len; \
1252 len=0; \
1253 } else { \
1254 len = 1; \
1255 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1256 skiplen = UNISKIP( uvc ); \
1257 foldlen -= skiplen; \
1258 uscan = foldbuf + skiplen; \
1259 } \
1260 break; \
1261 case trie_utf8: \
1262 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1263 break; \
1264 case trie_plain: \
1265 uvc = (UV)*uc; \
1266 len = 1; \
1267 } \
1268 if (uvc < 256) { \
1269 charid = trie->charmap[ uvc ]; \
1270 } \
1271 else { \
1272 charid = 0; \
1273 if (widecharmap) { \
1274 SV** const svpp = hv_fetch(widecharmap, \
1275 (char*)&uvc, sizeof(UV), 0); \
1276 if (svpp) \
1277 charid = (U16)SvIV(*svpp); \
1278 } \
1279 } \
4cadc6a9
YO
1280} STMT_END
1281
4cadc6a9
YO
1282#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1283STMT_START { \
1284 while (s <= e) { \
1285 if ( (CoNd) \
fac1af77 1286 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1287 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1288 goto got_it; \
1289 s++; \
1290 } \
1291} STMT_END
1292
1293#define REXEC_FBC_UTF8_SCAN(CoDe) \
1294STMT_START { \
9a902117 1295 while (s < strend) { \
4cadc6a9 1296 CoDe \
9a902117 1297 s += UTF8SKIP(s); \
4cadc6a9
YO
1298 } \
1299} STMT_END
1300
1301#define REXEC_FBC_SCAN(CoDe) \
1302STMT_START { \
1303 while (s < strend) { \
1304 CoDe \
1305 s++; \
1306 } \
1307} STMT_END
1308
1309#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1310REXEC_FBC_UTF8_SCAN( \
1311 if (CoNd) { \
7aee35ff 1312 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1313 goto got_it; \
1314 else \
1315 tmp = doevery; \
1316 } \
1317 else \
1318 tmp = 1; \
1319)
1320
1321#define REXEC_FBC_CLASS_SCAN(CoNd) \
1322REXEC_FBC_SCAN( \
1323 if (CoNd) { \
24b23f37 1324 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1325 goto got_it; \
1326 else \
1327 tmp = doevery; \
1328 } \
1329 else \
1330 tmp = 1; \
1331)
1332
1333#define REXEC_FBC_TRYIT \
24b23f37 1334if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1335 goto got_it
1336
e1d1eefb 1337#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1338 if (utf8_target) { \
e1d1eefb
YO
1339 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1340 } \
1341 else { \
1342 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1343 }
e1d1eefb 1344
786e8c11
YO
1345#define DUMP_EXEC_POS(li,s,doutf8) \
1346 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1347
cfaf538b
KW
1348
1349#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1350 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1351 tmp = TEST_NON_UTF8(tmp); \
1352 REXEC_FBC_UTF8_SCAN( \
1353 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1354 tmp = !tmp; \
1355 IF_SUCCESS; \
1356 } \
1357 else { \
1358 IF_FAIL; \
1359 } \
1360 ); \
1361
1362#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1363 if (s == PL_bostr) { \
1364 tmp = '\n'; \
1365 } \
1366 else { \
1367 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1368 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1369 } \
1370 tmp = TeSt1_UtF8; \
1371 LOAD_UTF8_CHARCLASS_ALNUM(); \
1372 REXEC_FBC_UTF8_SCAN( \
1373 if (tmp == ! (TeSt2_UtF8)) { \
1374 tmp = !tmp; \
1375 IF_SUCCESS; \
1376 } \
1377 else { \
1378 IF_FAIL; \
1379 } \
1380 ); \
1381
63ac0dad
KW
1382/* The only difference between the BOUND and NBOUND cases is that
1383 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1384 * NBOUND. This is accomplished by passing it in either the if or else clause,
1385 * with the other one being empty */
1386#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1387 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1388
1389#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1390 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1391
1392#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1393 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1394
1395#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1396 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1397
63ac0dad
KW
1398
1399/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1400 * be passed in completely with the variable name being tested, which isn't
1401 * such a clean interface, but this is easier to read than it was before. We
1402 * are looking for the boundary (or non-boundary between a word and non-word
1403 * character. The utf8 and non-utf8 cases have the same logic, but the details
1404 * must be different. Find the "wordness" of the character just prior to this
1405 * one, and compare it with the wordness of this one. If they differ, we have
1406 * a boundary. At the beginning of the string, pretend that the previous
1407 * character was a new-line */
cfaf538b 1408#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1409 if (utf8_target) { \
cfaf538b 1410 UTF8_CODE \
63ac0dad
KW
1411 } \
1412 else { /* Not utf8 */ \
1413 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1414 tmp = TEST_NON_UTF8(tmp); \
1415 REXEC_FBC_SCAN( \
1416 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1417 tmp = !tmp; \
1418 IF_SUCCESS; \
1419 } \
1420 else { \
1421 IF_FAIL; \
1422 } \
1423 ); \
1424 } \
1425 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1426 goto got_it;
1427
786e8c11
YO
1428/* We know what class REx starts with. Try to find this position... */
1429/* if reginfo is NULL, its a dryrun */
1430/* annoyingly all the vars in this routine have different names from their counterparts
1431 in regmatch. /grrr */
1432
3c3eec57 1433STATIC char *
07be1b83 1434S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
984e6dd1 1435 const char *strend, regmatch_info *reginfo, bool is_utf8_pat)
a687059c 1436{
73104a1b
KW
1437 dVAR;
1438 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1439 char *pat_string; /* The pattern's exactish string */
1440 char *pat_end; /* ptr to end char of pat_string */
1441 re_fold_t folder; /* Function for computing non-utf8 folds */
1442 const U8 *fold_array; /* array for folding ords < 256 */
1443 STRLEN ln;
1444 STRLEN lnc;
73104a1b
KW
1445 U8 c1;
1446 U8 c2;
1447 char *e;
1448 I32 tmp = 1; /* Scratch variable? */
1449 const bool utf8_target = PL_reg_match_utf8;
1450 UV utf8_fold_flags = 0;
3018b823
KW
1451 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1452 with a result inverts that result, as 0^1 =
1453 1 and 1^1 = 0 */
1454 _char_class_number classnum;
1455
73104a1b 1456 RXi_GET_DECL(prog,progi);
2f7f8cb1 1457
73104a1b 1458 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1459
73104a1b
KW
1460 /* We know what class it must start with. */
1461 switch (OP(c)) {
1462 case ANYOF:
9aa1e39f 1463 case ANYOF_SYNTHETIC:
954a2af6 1464 case ANYOF_WARN_SUPER:
73104a1b
KW
1465 if (utf8_target) {
1466 REXEC_FBC_UTF8_CLASS_SCAN(
1467 reginclass(prog, c, (U8*)s, utf8_target));
1468 }
1469 else {
1470 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1471 }
1472 break;
1473 case CANY:
1474 REXEC_FBC_SCAN(
1475 if (tmp && (!reginfo || regtry(reginfo, &s)))
1476 goto got_it;
1477 else
1478 tmp = doevery;
1479 );
1480 break;
1481
1482 case EXACTFA:
984e6dd1 1483 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1484 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1485 goto do_exactf_utf8;
1486 }
1487 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1488 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1489 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1490
73104a1b
KW
1491 case EXACTF:
1492 if (utf8_target) {
16d951b7 1493
73104a1b
KW
1494 /* regcomp.c already folded this if pattern is in UTF-8 */
1495 utf8_fold_flags = 0;
1496 goto do_exactf_utf8;
1497 }
1498 fold_array = PL_fold;
1499 folder = foldEQ;
1500 goto do_exactf_non_utf8;
1501
1502 case EXACTFL:
984e6dd1 1503 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1504 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1505 goto do_exactf_utf8;
1506 }
1507 fold_array = PL_fold_locale;
1508 folder = foldEQ_locale;
1509 goto do_exactf_non_utf8;
3c760661 1510
73104a1b 1511 case EXACTFU_SS:
984e6dd1 1512 if (is_utf8_pat) {
73104a1b
KW
1513 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1514 }
1515 goto do_exactf_utf8;
16d951b7 1516
73104a1b
KW
1517 case EXACTFU_TRICKYFOLD:
1518 case EXACTFU:
984e6dd1
DM
1519 if (is_utf8_pat || utf8_target) {
1520 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1521 goto do_exactf_utf8;
1522 }
fac1af77 1523
73104a1b
KW
1524 /* Any 'ss' in the pattern should have been replaced by regcomp,
1525 * so we don't have to worry here about this single special case
1526 * in the Latin1 range */
1527 fold_array = PL_fold_latin1;
1528 folder = foldEQ_latin1;
1529
1530 /* FALL THROUGH */
1531
1532 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1533 are no glitches with fold-length differences
1534 between the target string and pattern */
1535
1536 /* The idea in the non-utf8 EXACTF* cases is to first find the
1537 * first character of the EXACTF* node and then, if necessary,
1538 * case-insensitively compare the full text of the node. c1 is the
1539 * first character. c2 is its fold. This logic will not work for
1540 * Unicode semantics and the german sharp ss, which hence should
1541 * not be compiled into a node that gets here. */
1542 pat_string = STRING(c);
1543 ln = STR_LEN(c); /* length to match in octets/bytes */
1544
1545 /* We know that we have to match at least 'ln' bytes (which is the
1546 * same as characters, since not utf8). If we have to match 3
1547 * characters, and there are only 2 availabe, we know without
1548 * trying that it will fail; so don't start a match past the
1549 * required minimum number from the far end */
1550 e = HOP3c(strend, -((I32)ln), s);
1551
1552 if (!reginfo && e < s) {
1553 e = s; /* Due to minlen logic of intuit() */
1554 }
fac1af77 1555
73104a1b
KW
1556 c1 = *pat_string;
1557 c2 = fold_array[c1];
1558 if (c1 == c2) { /* If char and fold are the same */
1559 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1560 }
1561 else {
1562 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1563 }
1564 break;
fac1af77 1565
73104a1b
KW
1566 do_exactf_utf8:
1567 {
1568 unsigned expansion;
1569
1570 /* If one of the operands is in utf8, we can't use the simpler folding
1571 * above, due to the fact that many different characters can have the
1572 * same fold, or portion of a fold, or different- length fold */
1573 pat_string = STRING(c);
1574 ln = STR_LEN(c); /* length to match in octets/bytes */
1575 pat_end = pat_string + ln;
984e6dd1 1576 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1577 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1578 : ln;
1579
1580 /* We have 'lnc' characters to match in the pattern, but because of
1581 * multi-character folding, each character in the target can match
1582 * up to 3 characters (Unicode guarantees it will never exceed
1583 * this) if it is utf8-encoded; and up to 2 if not (based on the
1584 * fact that the Latin 1 folds are already determined, and the
1585 * only multi-char fold in that range is the sharp-s folding to
1586 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1587 * string character. Adjust lnc accordingly, rounding up, so that
1588 * if we need to match at least 4+1/3 chars, that really is 5. */
1589 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1590 lnc = (lnc + expansion - 1) / expansion;
1591
1592 /* As in the non-UTF8 case, if we have to match 3 characters, and
1593 * only 2 are left, it's guaranteed to fail, so don't start a
1594 * match that would require us to go beyond the end of the string
1595 */
1596 e = HOP3c(strend, -((I32)lnc), s);
1597
1598 if (!reginfo && e < s) {
1599 e = s; /* Due to minlen logic of intuit() */
1600 }
0658cdde 1601
73104a1b
KW
1602 /* XXX Note that we could recalculate e to stop the loop earlier,
1603 * as the worst case expansion above will rarely be met, and as we
1604 * go along we would usually find that e moves further to the left.
1605 * This would happen only after we reached the point in the loop
1606 * where if there were no expansion we should fail. Unclear if
1607 * worth the expense */
1608
1609 while (s <= e) {
1610 char *my_strend= (char *)strend;
1611 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1612 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
73104a1b
KW
1613 && (!reginfo || regtry(reginfo, &s)) )
1614 {
1615 goto got_it;
1616 }
1617 s += (utf8_target) ? UTF8SKIP(s) : 1;
1618 }
1619 break;
1620 }
1621 case BOUNDL:
272d35c9 1622 RXp_MATCH_TAINTED_on(prog);
0eb30aeb
KW
1623 FBC_BOUND(isWORDCHAR_LC,
1624 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1626 break;
1627 case NBOUNDL:
272d35c9 1628 RXp_MATCH_TAINTED_on(prog);
0eb30aeb
KW
1629 FBC_NBOUND(isWORDCHAR_LC,
1630 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1631 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1632 break;
1633 case BOUND:
1634 FBC_BOUND(isWORDCHAR,
0eb30aeb 1635 isWORDCHAR_uni(tmp),
03940dc2 1636 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1637 break;
1638 case BOUNDA:
1639 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1640 isWORDCHAR_A(tmp),
1641 isWORDCHAR_A((U8*)s));
1642 break;
1643 case NBOUND:
1644 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1645 isWORDCHAR_uni(tmp),
03940dc2 1646 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1647 break;
1648 case NBOUNDA:
1649 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1650 isWORDCHAR_A(tmp),
1651 isWORDCHAR_A((U8*)s));
1652 break;
1653 case BOUNDU:
1654 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1655 isWORDCHAR_uni(tmp),
03940dc2 1656 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1657 break;
1658 case NBOUNDU:
1659 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1660 isWORDCHAR_uni(tmp),
03940dc2 1661 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1662 break;
73104a1b
KW
1663 case LNBREAK:
1664 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1665 is_LNBREAK_latin1_safe(s, strend)
1666 );
1667 break;
3018b823
KW
1668
1669 /* The argument to all the POSIX node types is the class number to pass to
1670 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1671
1672 case NPOSIXL:
1673 to_complement = 1;
1674 /* FALLTHROUGH */
1675
1676 case POSIXL:
272d35c9 1677 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
1678 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1679 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1680 break;
3018b823
KW
1681
1682 case NPOSIXD:
1683 to_complement = 1;
1684 /* FALLTHROUGH */
1685
1686 case POSIXD:
1687 if (utf8_target) {
1688 goto posix_utf8;
1689 }
1690 goto posixa;
1691
1692 case NPOSIXA:
1693 if (utf8_target) {
1694 /* The complement of something that matches only ASCII matches all
1695 * UTF-8 variant code points, plus everything in ASCII that isn't
1696 * in the class */
1697 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1698 || ! _generic_isCC_A(*s, FLAGS(c)));
1699 break;
1700 }
1701
1702 to_complement = 1;
1703 /* FALLTHROUGH */
1704
73104a1b 1705 case POSIXA:
3018b823 1706 posixa:
73104a1b 1707 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1708 * byte invariant character. */
1709 REXEC_FBC_CLASS_SCAN(
1710 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1711 break;
3018b823
KW
1712
1713 case NPOSIXU:
1714 to_complement = 1;
1715 /* FALLTHROUGH */
1716
1717 case POSIXU:
1718 if (! utf8_target) {
1719 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1720 FLAGS(c))));
1721 }
1722 else {
1723
1724 posix_utf8:
1725 classnum = (_char_class_number) FLAGS(c);
1726 if (classnum < _FIRST_NON_SWASH_CC) {
1727 while (s < strend) {
1728
1729 /* We avoid loading in the swash as long as possible, but
1730 * should we have to, we jump to a separate loop. This
1731 * extra 'if' statement is what keeps this code from being
1732 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1733 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1734 goto found_above_latin1;
1735 }
1736 if ((UTF8_IS_INVARIANT(*s)
1737 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1738 classnum)))
1739 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1740 && to_complement ^ cBOOL(
1741 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1742 classnum))))
1743 {
1744 if (tmp && (!reginfo || regtry(reginfo, &s)))
1745 goto got_it;
1746 else {
1747 tmp = doevery;
1748 }
1749 }
1750 else {
1751 tmp = 1;
1752 }
1753 s += UTF8SKIP(s);
1754 }
1755 }
1756 else switch (classnum) { /* These classes are implemented as
1757 macros */
1758 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1759 revert the change of \v matching this */
1760 /* FALL THROUGH */
1761
1762 case _CC_ENUM_PSXSPC:
1763 REXEC_FBC_UTF8_CLASS_SCAN(
1764 to_complement ^ cBOOL(isSPACE_utf8(s)));
1765 break;
1766
1767 case _CC_ENUM_BLANK:
1768 REXEC_FBC_UTF8_CLASS_SCAN(
1769 to_complement ^ cBOOL(isBLANK_utf8(s)));
1770 break;
1771
1772 case _CC_ENUM_XDIGIT:
1773 REXEC_FBC_UTF8_CLASS_SCAN(
1774 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1775 break;
1776
1777 case _CC_ENUM_VERTSPACE:
1778 REXEC_FBC_UTF8_CLASS_SCAN(
1779 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1780 break;
1781
1782 case _CC_ENUM_CNTRL:
1783 REXEC_FBC_UTF8_CLASS_SCAN(
1784 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1785 break;
1786
1787 default:
1788 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1789 assert(0); /* NOTREACHED */
1790 }
1791 }
1792 break;
1793
1794 found_above_latin1: /* Here we have to load a swash to get the result
1795 for the current code point */
1796 if (! PL_utf8_swash_ptrs[classnum]) {
1797 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1798 PL_utf8_swash_ptrs[classnum] =
1799 _core_swash_init("utf8", swash_property_names[classnum],
1800 &PL_sv_undef, 1, 0, NULL, &flags);
1801 }
1802
1803 /* This is a copy of the loop above for swash classes, though using the
1804 * FBC macro instead of being expanded out. Since we've loaded the
1805 * swash, we don't have to check for that each time through the loop */
1806 REXEC_FBC_UTF8_CLASS_SCAN(
1807 to_complement ^ cBOOL(_generic_utf8(
1808 classnum,
1809 s,
1810 swash_fetch(PL_utf8_swash_ptrs[classnum],
1811 (U8 *) s, TRUE))));
73104a1b
KW
1812 break;
1813
1814 case AHOCORASICKC:
1815 case AHOCORASICK:
1816 {
1817 DECL_TRIE_TYPE(c);
1818 /* what trie are we using right now */
1819 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1820 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1821 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1822
1823 const char *last_start = strend - trie->minlen;
6148ee25 1824#ifdef DEBUGGING
73104a1b 1825 const char *real_start = s;
6148ee25 1826#endif
73104a1b
KW
1827 STRLEN maxlen = trie->maxlen;
1828 SV *sv_points;
1829 U8 **points; /* map of where we were in the input string
1830 when reading a given char. For ASCII this
1831 is unnecessary overhead as the relationship
1832 is always 1:1, but for Unicode, especially
1833 case folded Unicode this is not true. */
1834 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1835 U8 *bitmap=NULL;
1836
1837
1838 GET_RE_DEBUG_FLAGS_DECL;
1839
1840 /* We can't just allocate points here. We need to wrap it in
1841 * an SV so it gets freed properly if there is a croak while
1842 * running the match */
1843 ENTER;
1844 SAVETMPS;
1845 sv_points=newSV(maxlen * sizeof(U8 *));
1846 SvCUR_set(sv_points,
1847 maxlen * sizeof(U8 *));
1848 SvPOK_on(sv_points);
1849 sv_2mortal(sv_points);
1850 points=(U8**)SvPV_nolen(sv_points );
1851 if ( trie_type != trie_utf8_fold
1852 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1853 {
1854 if (trie->bitmap)
1855 bitmap=(U8*)trie->bitmap;
1856 else
1857 bitmap=(U8*)ANYOF_BITMAP(c);
1858 }
1859 /* this is the Aho-Corasick algorithm modified a touch
1860 to include special handling for long "unknown char" sequences.
1861 The basic idea being that we use AC as long as we are dealing
1862 with a possible matching char, when we encounter an unknown char
1863 (and we have not encountered an accepting state) we scan forward
1864 until we find a legal starting char.
1865 AC matching is basically that of trie matching, except that when
1866 we encounter a failing transition, we fall back to the current
1867 states "fail state", and try the current char again, a process
1868 we repeat until we reach the root state, state 1, or a legal
1869 transition. If we fail on the root state then we can either
1870 terminate if we have reached an accepting state previously, or
1871 restart the entire process from the beginning if we have not.
1872
1873 */
1874 while (s <= last_start) {
1875 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1876 U8 *uc = (U8*)s;
1877 U16 charid = 0;
1878 U32 base = 1;
1879 U32 state = 1;
1880 UV uvc = 0;
1881 STRLEN len = 0;
1882 STRLEN foldlen = 0;
1883 U8 *uscan = (U8*)NULL;
1884 U8 *leftmost = NULL;
1885#ifdef DEBUGGING
1886 U32 accepted_word= 0;
786e8c11 1887#endif
73104a1b
KW
1888 U32 pointpos = 0;
1889
1890 while ( state && uc <= (U8*)strend ) {
1891 int failed=0;
1892 U32 word = aho->states[ state ].wordnum;
1893
1894 if( state==1 ) {
1895 if ( bitmap ) {
1896 DEBUG_TRIE_EXECUTE_r(
1897 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1898 dump_exec_pos( (char *)uc, c, strend, real_start,
1899 (char *)uc, utf8_target );
1900 PerlIO_printf( Perl_debug_log,
1901 " Scanning for legal start char...\n");
1902 }
1903 );
1904 if (utf8_target) {
1905 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1906 uc += UTF8SKIP(uc);
1907 }
1908 } else {
1909 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1910 uc++;
1911 }
786e8c11 1912 }
73104a1b 1913 s= (char *)uc;
07be1b83 1914 }
73104a1b
KW
1915 if (uc >(U8*)last_start) break;
1916 }
1917
1918 if ( word ) {
1919 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1920 if (!leftmost || lpos < leftmost) {
1921 DEBUG_r(accepted_word=word);
1922 leftmost= lpos;
7016d6eb 1923 }
73104a1b 1924 if (base==0) break;
7016d6eb 1925
73104a1b
KW
1926 }
1927 points[pointpos++ % maxlen]= uc;
1928 if (foldlen || uc < (U8*)strend) {
1929 REXEC_TRIE_READ_CHAR(trie_type, trie,
1930 widecharmap, uc,
1931 uscan, len, uvc, charid, foldlen,
1932 foldbuf, uniflags);
1933 DEBUG_TRIE_EXECUTE_r({
1934 dump_exec_pos( (char *)uc, c, strend,
1935 real_start, s, utf8_target);
1936 PerlIO_printf(Perl_debug_log,
1937 " Charid:%3u CP:%4"UVxf" ",
1938 charid, uvc);
1939 });
1940 }
1941 else {
1942 len = 0;
1943 charid = 0;
1944 }
07be1b83 1945
73104a1b
KW
1946
1947 do {
6148ee25 1948#ifdef DEBUGGING
73104a1b 1949 word = aho->states[ state ].wordnum;
6148ee25 1950#endif
73104a1b
KW
1951 base = aho->states[ state ].trans.base;
1952
1953 DEBUG_TRIE_EXECUTE_r({
1954 if (failed)
1955 dump_exec_pos( (char *)uc, c, strend, real_start,
1956 s, utf8_target );
1957 PerlIO_printf( Perl_debug_log,
1958 "%sState: %4"UVxf", word=%"UVxf,
1959 failed ? " Fail transition to " : "",
1960 (UV)state, (UV)word);
1961 });
1962 if ( base ) {
1963 U32 tmp;
1964 I32 offset;
1965 if (charid &&
1966 ( ((offset = base + charid
1967 - 1 - trie->uniquecharcount)) >= 0)
1968 && ((U32)offset < trie->lasttrans)
1969 && trie->trans[offset].check == state
1970 && (tmp=trie->trans[offset].next))
1971 {
1972 DEBUG_TRIE_EXECUTE_r(
1973 PerlIO_printf( Perl_debug_log," - legal\n"));
1974 state = tmp;
1975 break;
07be1b83
YO
1976 }
1977 else {
786e8c11 1978 DEBUG_TRIE_EXECUTE_r(
73104a1b 1979 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 1980 failed = 1;
73104a1b 1981 state = aho->fail[state];
07be1b83 1982 }
07be1b83 1983 }
73104a1b
KW
1984 else {
1985 /* we must be accepting here */
1986 DEBUG_TRIE_EXECUTE_r(
1987 PerlIO_printf( Perl_debug_log," - accepting\n"));
1988 failed = 1;
1989 break;
786e8c11 1990 }
73104a1b
KW
1991 } while(state);
1992 uc += len;
1993 if (failed) {
1994 if (leftmost)
1995 break;
1996 if (!state) state = 1;
07be1b83 1997 }
73104a1b
KW
1998 }
1999 if ( aho->states[ state ].wordnum ) {
2000 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2001 if (!leftmost || lpos < leftmost) {
2002 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2003 leftmost = lpos;
07be1b83
YO
2004 }
2005 }
73104a1b
KW
2006 if (leftmost) {
2007 s = (char*)leftmost;
2008 DEBUG_TRIE_EXECUTE_r({
2009 PerlIO_printf(
2010 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2011 (UV)accepted_word, (IV)(s - real_start)
2012 );
2013 });
2014 if (!reginfo || regtry(reginfo, &s)) {
2015 FREETMPS;
2016 LEAVE;
2017 goto got_it;
2018 }
2019 s = HOPc(s,1);
2020 DEBUG_TRIE_EXECUTE_r({
2021 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2022 });
2023 } else {
2024 DEBUG_TRIE_EXECUTE_r(
2025 PerlIO_printf( Perl_debug_log,"No match.\n"));
2026 break;
2027 }
2028 }
2029 FREETMPS;
2030 LEAVE;
2031 }
2032 break;
2033 default:
2034 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2035 break;
2036 }
2037 return 0;
2038 got_it:
2039 return s;
6eb5f6b9
JH
2040}
2041
fae667d5 2042
6eb5f6b9
JH
2043/*
2044 - regexec_flags - match a regexp against a string
2045 */
2046I32
5aaab254 2047Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
6eb5f6b9 2048 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2049/* stringarg: the point in the string at which to begin matching */
2050/* strend: pointer to null at end of string */
2051/* strbeg: real beginning of string */
2052/* minend: end of match must be >= minend bytes after stringarg. */
2053/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2054 * itself is accessed via the pointers above */
2055/* data: May be used for some additional optimizations.
2056 Currently its only used, with a U32 cast, for transmitting
2057 the ganch offset when doing a /g match. This will change */
2058/* nosave: For optimizations. */
2059
6eb5f6b9 2060{
97aff369 2061 dVAR;
8d919b0a 2062 struct regexp *const prog = ReANY(rx);
5aaab254 2063 char *s;
eb578fdb 2064 regnode *c;
5aaab254 2065 char *startpos = stringarg;
6eb5f6b9
JH
2066 I32 minlen; /* must match at least this many chars */
2067 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2068 I32 end_shift = 0; /* Same for the end. */ /* CC */
2069 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2070 char *scream_olds = NULL;
f2ed9b32 2071 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2072 I32 multiline;
f8fc2ecf 2073 RXi_GET_DECL(prog,progi);
3b0527fe 2074 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2075 regexp_paren_pair *swap = NULL;
a3621e74
YO
2076 GET_RE_DEBUG_FLAGS_DECL;
2077
7918f24d 2078 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2079 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2080
2081 /* Be paranoid... */
2082 if (prog == NULL || startpos == NULL) {
2083 Perl_croak(aTHX_ "NULL regexp parameter");
2084 return 0;
2085 }
2086
bbe252da 2087 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2088 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2089
f2ed9b32 2090 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2091 DEBUG_EXECUTE_r(
f2ed9b32 2092 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2093 "Matching");
2094 );
bac06658 2095
6eb5f6b9 2096 minlen = prog->minlen;
1de06328
YO
2097
2098 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2099 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2100 "String too short [regexec_flags]...\n"));
2101 goto phooey;
1aa99e6b 2102 }
6eb5f6b9 2103
1de06328 2104
6eb5f6b9 2105 /* Check validity of program. */
f8fc2ecf 2106 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2107 Perl_croak(aTHX_ "corrupted regexp program");
2108 }
2109
272d35c9 2110 RX_MATCH_TAINTED_off(rx);
ed301438 2111 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2112 PL_reg_maxiter = 0;
2113
984e6dd1 2114 reginfo.is_utf8_pat = cBOOL(RX_UTF8(rx));
39819bd9 2115 reginfo.warned = FALSE;
6eb5f6b9 2116 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2117 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2118 PL_bostr = strbeg;
3b0527fe 2119 reginfo.sv = sv;
6eb5f6b9
JH
2120
2121 /* Mark end of line for $ (and such) */
2122 PL_regeol = strend;
2123
2124 /* see how far we have to get to not match where we matched before */
3b0527fe 2125 reginfo.till = startpos+minend;
6eb5f6b9 2126
6eb5f6b9
JH
2127 /* If there is a "must appear" string, look for it. */
2128 s = startpos;
2129
bbe252da 2130 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2131 MAGIC *mg;
2c296965 2132 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2133 reginfo.ganch = startpos + prog->gofs;
2c296965 2134 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2135 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2136 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2137 && SvMAGIC(sv)
14befaf4
DM
2138 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2139 && mg->mg_len >= 0) {
3b0527fe 2140 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2141 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2142 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2143
bbe252da 2144 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2145 if (s > reginfo.ganch)
6eb5f6b9 2146 goto phooey;
58e23c8d 2147 s = reginfo.ganch - prog->gofs;
2c296965 2148 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2149 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2150 if (s < strbeg)
2151 goto phooey;
6eb5f6b9
JH
2152 }
2153 }
58e23c8d 2154 else if (data) {
70685ca0 2155 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2156 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2157 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2158
2159 } else { /* pos() not defined */
3b0527fe 2160 reginfo.ganch = strbeg;
2c296965
YO
2161 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2162 "GPOS: reginfo.ganch = strbeg\n"));
2163 }
6eb5f6b9 2164 }
288b8c02 2165 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2166 /* We have to be careful. If the previous successful match
2167 was from this regex we don't want a subsequent partially
2168 successful match to clobber the old results.
2169 So when we detect this possibility we add a swap buffer
d8da0584
KW
2170 to the re, and switch the buffer each match. If we fail,
2171 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2172 */
2173 swap = prog->offs;
2174 /* do we need a save destructor here for eval dies? */
2175 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2176 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2177 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2178 PTR2UV(prog),
2179 PTR2UV(swap),
2180 PTR2UV(prog->offs)
2181 ));
c74340f9 2182 }
a0714e2c 2183 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2184 re_scream_pos_data d;
2185
2186 d.scream_olds = &scream_olds;
2187 d.scream_pos = &scream_pos;
288b8c02 2188 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2189 if (!s) {
a3621e74 2190 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2191 goto phooey; /* not present */
3fa9c3d7 2192 }
6eb5f6b9
JH
2193 }
2194
1de06328 2195
6eb5f6b9
JH
2196
2197 /* Simplest case: anchored match need be tried only once. */
2198 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2199 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2200 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2201 goto got_it;
bbe252da
YO
2202 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2203 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2204 {
2205 char *end;
2206
2207 if (minlen)
2208 dontbother = minlen - 1;
1aa99e6b 2209 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2210 /* for multiline we only have to try after newlines */
33b8afdf 2211 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2212 /* because of the goto we can not easily reuse the macros for bifurcating the
2213 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2214 if (utf8_target) {
2215 if (s == startpos)
2216 goto after_try_utf8;
2217 while (1) {
2218 if (regtry(&reginfo, &s)) {
2219 goto got_it;
2220 }
2221 after_try_utf8:
2222 if (s > end) {
2223 goto phooey;
2224 }
2225 if (prog->extflags & RXf_USE_INTUIT) {
2226 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2227 if (!s) {
2228 goto phooey;
2229 }
2230 }
2231 else {
2232 s += UTF8SKIP(s);
2233 }
2234 }
2235 } /* end search for check string in unicode */
2236 else {
2237 if (s == startpos) {
2238 goto after_try_latin;
2239 }
2240 while (1) {
2241 if (regtry(&reginfo, &s)) {
2242 goto got_it;
2243 }
2244 after_try_latin:
2245 if (s > end) {
2246 goto phooey;
2247 }
2248 if (prog->extflags & RXf_USE_INTUIT) {
2249 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2250 if (!s) {
2251 goto phooey;
2252 }
2253 }
2254 else {
2255 s++;
2256 }
2257 }
2258 } /* end search for check string in latin*/
2259 } /* end search for check string */
2260 else { /* search for newline */
2261 if (s > startpos) {
2262 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2263 s--;
92f3d482 2264 }
21eede78
YO
2265 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2266 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2267 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2268 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2269 goto got_it;
2270 }
92f3d482
YO
2271 }
2272 } /* end search for newline */
2273 } /* end anchored/multiline check string search */
6eb5f6b9 2274 goto phooey;
bbe252da 2275 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2276 {
486ec47a 2277 /* the warning about reginfo.ganch being used without initialization
bbe252da 2278 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2279 and we only enter this block when the same bit is set. */
58e23c8d 2280 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2281
2282 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2283 goto got_it;
2284 goto phooey;
2285 }
2286
2287 /* Messy cases: unanchored match. */
bbe252da 2288 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2289 /* we have /x+whatever/ */
984e6dd1 2290 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2291 char ch;
bf93d4cc
GS
2292#ifdef DEBUGGING
2293 int did_match = 0;
2294#endif
f2ed9b32 2295 if (utf8_target) {
7e0d5ad7
KW
2296 if (! prog->anchored_utf8) {
2297 to_utf8_substr(prog);
2298 }
2299 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2300 REXEC_FBC_SCAN(
6eb5f6b9 2301 if (*s == ch) {
a3621e74 2302 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2303 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2304 s += UTF8SKIP(s);
2305 while (s < strend && *s == ch)
2306 s += UTF8SKIP(s);
2307 }
4cadc6a9 2308 );
7e0d5ad7 2309
6eb5f6b9
JH
2310 }
2311 else {
7e0d5ad7
KW
2312 if (! prog->anchored_substr) {
2313 if (! to_byte_substr(prog)) {
6b54ddc5 2314 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2315 }
2316 }
2317 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2318 REXEC_FBC_SCAN(
6eb5f6b9 2319 if (*s == ch) {
a3621e74 2320 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2321 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2322 s++;
2323 while (s < strend && *s == ch)
2324 s++;
2325 }
4cadc6a9 2326 );
6eb5f6b9 2327 }
a3621e74 2328 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2329 PerlIO_printf(Perl_debug_log,
b7953727
JH
2330 "Did not find anchored character...\n")
2331 );
6eb5f6b9 2332 }
a0714e2c
SS
2333 else if (prog->anchored_substr != NULL
2334 || prog->anchored_utf8 != NULL
2335 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2336 && prog->float_max_offset < strend - s)) {
2337 SV *must;
2338 I32 back_max;
2339 I32 back_min;
2340 char *last;
6eb5f6b9 2341 char *last1; /* Last position checked before */
bf93d4cc
GS
2342#ifdef DEBUGGING
2343 int did_match = 0;
2344#endif
33b8afdf 2345 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2346 if (utf8_target) {
2347 if (! prog->anchored_utf8) {
2348 to_utf8_substr(prog);
2349 }
2350 must = prog->anchored_utf8;
2351 }
2352 else {
2353 if (! prog->anchored_substr) {
2354 if (! to_byte_substr(prog)) {
6b54ddc5 2355 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2356 }
2357 }
2358 must = prog->anchored_substr;
2359 }
33b8afdf
JH
2360 back_max = back_min = prog->anchored_offset;
2361 } else {
7e0d5ad7
KW
2362 if (utf8_target) {
2363 if (! prog->float_utf8) {
2364 to_utf8_substr(prog);
2365 }
2366 must = prog->float_utf8;
2367 }
2368 else {
2369 if (! prog->float_substr) {
2370 if (! to_byte_substr(prog)) {
6b54ddc5 2371 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2372 }
2373 }
2374 must = prog->float_substr;
2375 }
33b8afdf
JH
2376 back_max = prog->float_max_offset;
2377 back_min = prog->float_min_offset;
2378 }
1de06328 2379
1de06328
YO
2380 if (back_min<0) {
2381 last = strend;
2382 } else {
2383 last = HOP3c(strend, /* Cannot start after this */
2384 -(I32)(CHR_SVLEN(must)
2385 - (SvTAIL(must) != 0) + back_min), strbeg);
2386 }
6eb5f6b9
JH
2387 if (s > PL_bostr)
2388 last1 = HOPc(s, -1);
2389 else
2390 last1 = s - 1; /* bogus */
2391
a0288114 2392 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2393 check_substr==must. */
2394 scream_pos = -1;
2395 dontbother = end_shift;
2396 strend = HOPc(strend, -dontbother);
2397 while ( (s <= last) &&
c33e64f0 2398 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2399 (unsigned char*)strend, must,
c33e64f0 2400 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2401 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2402 if (HOPc(s, -back_max) > last1) {
2403 last1 = HOPc(s, -back_min);
2404 s = HOPc(s, -back_max);
2405 }
2406 else {
52657f30 2407 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2408
2409 last1 = HOPc(s, -back_min);
52657f30 2410 s = t;
6eb5f6b9 2411 }
f2ed9b32 2412 if (utf8_target) {
6eb5f6b9 2413 while (s <= last1) {
24b23f37 2414 if (regtry(&reginfo, &s))
6eb5f6b9 2415 goto got_it;
7016d6eb
DM
2416 if (s >= last1) {
2417 s++; /* to break out of outer loop */
2418 break;
2419 }
2420 s += UTF8SKIP(s);
6eb5f6b9
JH
2421 }
2422 }
2423 else {
2424 while (s <= last1) {
24b23f37 2425 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2426 goto got_it;
2427 s++;
2428 }
2429 }
2430 }
ab3bbdeb 2431 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2432 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2433 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2434 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2435 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2436 ? "anchored" : "floating"),
ab3bbdeb
YO
2437 quoted, RE_SV_TAIL(must));
2438 });
6eb5f6b9
JH
2439 goto phooey;
2440 }
f8fc2ecf 2441 else if ( (c = progi->regstclass) ) {
f14c76ed 2442 if (minlen) {
f8fc2ecf 2443 const OPCODE op = OP(progi->regstclass);
66e933ab 2444 /* don't bother with what can't match */
786e8c11 2445 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2446 strend = HOPc(strend, -(minlen - 1));
2447 }
a3621e74 2448 DEBUG_EXECUTE_r({
be8e71aa 2449 SV * const prop = sv_newmortal();
32fc9b6a 2450 regprop(prog, prop, c);
0df25f3d 2451 {
f2ed9b32 2452 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2453 s,strend-s,60);
0df25f3d 2454 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2455 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2456 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2457 quoted, (int)(strend - s));
0df25f3d 2458 }
ffc61ed2 2459 });
984e6dd1 2460 if (find_byclass(prog, c, s, strend, &reginfo, reginfo.is_utf8_pat))
6eb5f6b9 2461 goto got_it;
07be1b83 2462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2463 }
2464 else {
2465 dontbother = 0;
a0714e2c 2466 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2467 /* Trim the end. */
6af40bd7 2468 char *last= NULL;
33b8afdf 2469 SV* float_real;
c33e64f0
FC
2470 STRLEN len;
2471 const char *little;
33b8afdf 2472
7e0d5ad7
KW
2473 if (utf8_target) {
2474 if (! prog->float_utf8) {
2475 to_utf8_substr(prog);
2476 }
2477 float_real = prog->float_utf8;
2478 }
2479 else {
2480 if (! prog->float_substr) {
2481 if (! to_byte_substr(prog)) {
6b54ddc5 2482 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2483 }
2484 }
2485 float_real = prog->float_substr;
2486 }
d6a28714 2487
c33e64f0
FC
2488 little = SvPV_const(float_real, len);
2489 if (SvTAIL(float_real)) {
7f18ad16
KW
2490 /* This means that float_real contains an artificial \n on
2491 * the end due to the presence of something like this:
2492 * /foo$/ where we can match both "foo" and "foo\n" at the
2493 * end of the string. So we have to compare the end of the
2494 * string first against the float_real without the \n and
2495 * then against the full float_real with the string. We
2496 * have to watch out for cases where the string might be
2497 * smaller than the float_real or the float_real without
2498 * the \n. */
1a13b075
YO
2499 char *checkpos= strend - len;
2500 DEBUG_OPTIMISE_r(
2501 PerlIO_printf(Perl_debug_log,
2502 "%sChecking for float_real.%s\n",
2503 PL_colors[4], PL_colors[5]));
2504 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2505 /* can't match, even if we remove the trailing \n
2506 * string is too short to match */
1a13b075
YO
2507 DEBUG_EXECUTE_r(
2508 PerlIO_printf(Perl_debug_log,
2509 "%sString shorter than required trailing substring, cannot match.%s\n",
2510 PL_colors[4], PL_colors[5]));
2511 goto phooey;
2512 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2513 /* can match, the end of the string matches without the
2514 * "\n" */
1a13b075
YO
2515 last = checkpos + 1;
2516 } else if (checkpos < strbeg) {
7f18ad16
KW
2517 /* cant match, string is too short when the "\n" is
2518 * included */
1a13b075
YO
2519 DEBUG_EXECUTE_r(
2520 PerlIO_printf(Perl_debug_log,
2521 "%sString does not contain required trailing substring, cannot match.%s\n",
2522 PL_colors[4], PL_colors[5]));
2523 goto phooey;
2524 } else if (!multiline) {
7f18ad16
KW
2525 /* non multiline match, so compare with the "\n" at the
2526 * end of the string */
1a13b075
YO
2527 if (memEQ(checkpos, little, len)) {
2528 last= checkpos;
2529 } else {
2530 DEBUG_EXECUTE_r(
2531 PerlIO_printf(Perl_debug_log,
2532 "%sString does not contain required trailing substring, cannot match.%s\n",
2533 PL_colors[4], PL_colors[5]));
2534 goto phooey;
2535 }
2536 } else {
7f18ad16
KW
2537 /* multiline match, so we have to search for a place
2538 * where the full string is located */
d6a28714 2539 goto find_last;
1a13b075 2540 }
c33e64f0 2541 } else {
d6a28714 2542 find_last:
9041c2e3 2543 if (len)
d6a28714 2544 last = rninstr(s, strend, little, little + len);
b8c5462f 2545 else
a0288114 2546 last = strend; /* matching "$" */
b8c5462f 2547 }
6af40bd7 2548 if (!last) {
7f18ad16
KW
2549 /* at one point this block contained a comment which was
2550 * probably incorrect, which said that this was a "should not
2551 * happen" case. Even if it was true when it was written I am
2552 * pretty sure it is not anymore, so I have removed the comment
2553 * and replaced it with this one. Yves */
6bda09f9
YO
2554 DEBUG_EXECUTE_r(
2555 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2556 "String does not contain required substring, cannot match.\n"
2557 ));
2558 goto phooey;
bf93d4cc 2559 }
d6a28714
JH
2560 dontbother = strend - last + prog->float_min_offset;
2561 }
2562 if (minlen && (dontbother < minlen))
2563 dontbother = minlen - 1;
2564 strend -= dontbother; /* this one's always in bytes! */
2565 /* We don't know much -- general case. */
f2ed9b32 2566 if (utf8_target) {
d6a28714 2567 for (;;) {
24b23f37 2568 if (regtry(&reginfo, &s))
d6a28714
JH
2569 goto got_it;
2570 if (s >= strend)
2571 break;
b8c5462f 2572 s += UTF8SKIP(s);
d6a28714
JH
2573 };
2574 }
2575 else {
2576 do {
24b23f37 2577 if (regtry(&reginfo, &s))
d6a28714
JH
2578 goto got_it;
2579 } while (s++ < strend);
2580 }
2581 }
2582
2583 /* Failure. */
2584 goto phooey;
2585
2586got_it:
495f47a5
DM
2587 DEBUG_BUFFERS_r(
2588 if (swap)
2589 PerlIO_printf(Perl_debug_log,
2590 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2591 PTR2UV(prog),
2592 PTR2UV(swap)
2593 );
2594 );
e9105d30 2595 Safefree(swap);
d6a28714 2596
ed301438 2597 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2598 restore_pos(aTHX_ prog);
5daac39c
NC
2599 if (RXp_PAREN_NAMES(prog))
2600 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2601
2602 /* make sure $`, $&, $', and $digit will work later */
2603 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2604 if (flags & REXEC_COPY_STR) {
db2c6cb3
FC
2605#ifdef PERL_ANY_COW
2606 if (SvCANCOW(sv)) {
ed252734
NC
2607 if (DEBUG_C_TEST) {
2608 PerlIO_printf(Perl_debug_log,
2609 "Copy on write: regexp capture, type %d\n",
2610 (int) SvTYPE(sv));
2611 }
77f8f7c1 2612 RX_MATCH_COPY_FREE(rx);
ed252734 2613 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2614 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2615 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2616 prog->sublen = PL_regeol - strbeg;
2617 prog->suboffset = 0;
2618 prog->subcoffset = 0;
ed252734
NC
2619 } else
2620#endif
2621 {
6502e081
DM
2622 I32 min = 0;
2623 I32 max = PL_regeol - strbeg;
2624 I32 sublen;
2625
2626 if ( (flags & REXEC_COPY_SKIP_POST)
2627 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2628 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2629 ) { /* don't copy $' part of string */
3de645a8 2630 U32 n = 0;
6502e081
DM
2631 max = -1;
2632 /* calculate the right-most part of the string covered
2633 * by a capture. Due to look-ahead, this may be to
2634 * the right of $&, so we have to scan all captures */
2635 while (n <= prog->lastparen) {
2636 if (prog->offs[n].end > max)
2637 max = prog->offs[n].end;
2638 n++;
2639 }
2640 if (max == -1)
2641 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2642 ? prog->offs[0].start
2643 : 0;
2644 assert(max >= 0 && max <= PL_regeol - strbeg);
2645 }
2646
2647 if ( (flags & REXEC_COPY_SKIP_PRE)
2648 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2649 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2650 ) { /* don't copy $` part of string */
3de645a8 2651 U32 n = 0;
6502e081
DM
2652 min = max;
2653 /* calculate the left-most part of the string covered
2654 * by a capture. Due to look-behind, this may be to
2655 * the left of $&, so we have to scan all captures */
2656 while (min && n <= prog->lastparen) {
2657 if ( prog->offs[n].start != -1
2658 && prog->offs[n].start < min)
2659 {
2660 min = prog->offs[n].start;
2661 }
2662 n++;
2663 }
2664 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2665 && min > prog->offs[0].end
2666 )
2667 min = prog->offs[0].end;
2668
2669 }
2670
2671 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2672 sublen = max - min;
2673
2674 if (RX_MATCH_COPIED(rx)) {
2675 if (sublen > prog->sublen)
2676 prog->subbeg =
2677 (char*)saferealloc(prog->subbeg, sublen+1);
2678 }
2679 else
2680 prog->subbeg = (char*)safemalloc(sublen+1);
2681 Copy(strbeg + min, prog->subbeg, sublen, char);
2682 prog->subbeg[sublen] = '\0';
2683 prog->suboffset = min;
2684 prog->sublen = sublen;
77f8f7c1 2685 RX_MATCH_COPIED_on(rx);
6502e081 2686 }
6502e081
DM
2687 prog->subcoffset = prog->suboffset;
2688 if (prog->suboffset && utf8_target) {
2689 /* Convert byte offset to chars.
2690 * XXX ideally should only compute this if @-/@+
2691 * has been seen, a la PL_sawampersand ??? */
2692
2693 /* If there's a direct correspondence between the
2694 * string which we're matching and the original SV,
2695 * then we can use the utf8 len cache associated with
2696 * the SV. In particular, it means that under //g,
2697 * sv_pos_b2u() will use the previously cached
2698 * position to speed up working out the new length of
2699 * subcoffset, rather than counting from the start of
2700 * the string each time. This stops
2701 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2702 * from going quadratic */
2703 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2704 sv_pos_b2u(sv, &(prog->subcoffset));
2705 else
2706 prog->subcoffset = utf8_length((U8*)strbeg,
2707 (U8*)(strbeg+prog->suboffset));
2708 }
d6a28714
JH
2709 }
2710 else {
6502e081 2711 RX_MATCH_COPY_FREE(rx);
d6a28714 2712 prog->subbeg = strbeg;
6502e081
DM
2713 prog->suboffset = 0;
2714 prog->subcoffset = 0;
d6a28714
JH
2715 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2716 }
2717 }
9041c2e3 2718
d6a28714
JH
2719 return 1;
2720
2721phooey:
a3621e74 2722 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2723 PL_colors[4], PL_colors[5]));
ed301438 2724 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2725 restore_pos(aTHX_ prog);
e9105d30 2726 if (swap) {
c74340f9 2727 /* we failed :-( roll it back */
495f47a5
DM
2728 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2729 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2730 PTR2UV(prog),
2731 PTR2UV(prog->offs),
2732 PTR2UV(swap)
2733 ));
e9105d30
GG
2734 Safefree(prog->offs);
2735 prog->offs = swap;
2736 }
d6a28714
JH
2737 return 0;
2738}
2739
6bda09f9 2740
ec43f78b
DM
2741/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2742 * Do inc before dec, in case old and new rex are the same */
2743#define SET_reg_curpm(Re2) \
2744 if (PL_reg_state.re_state_eval_setup_done) { \
2745 (void)ReREFCNT_inc(Re2); \
2746 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2747 PM_SETRE((PL_reg_curpm), (Re2)); \
2748 }
2749
2750
d6a28714
JH
2751/*
2752 - regtry - try match at specific point
2753 */
2754STATIC I32 /* 0 failure, 1 success */
f73aaa43 2755S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2756{
97aff369 2757 dVAR;
d6a28714 2758 CHECKPOINT lastcp;
288b8c02 2759 REGEXP *const rx = reginfo->prog;
8d919b0a 2760 regexp *const prog = ReANY(rx);
f73aaa43 2761 I32 result;
f8fc2ecf 2762 RXi_GET_DECL(prog,progi);
a3621e74 2763 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2764
2765 PERL_ARGS_ASSERT_REGTRY;
2766
24b23f37 2767 reginfo->cutpoint=NULL;
d6a28714 2768
ed301438
DM
2769 if ((prog->extflags & RXf_EVAL_SEEN)
2770 && !PL_reg_state.re_state_eval_setup_done)
2771 {
d6a28714
JH
2772 MAGIC *mg;
2773
ed301438 2774 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2775 if (reginfo->sv) {
d6a28714 2776 /* Make $_ available to executed code. */
3b0527fe 2777 if (reginfo->sv != DEFSV) {
59f00321 2778 SAVE_DEFSV;
414bf5ae 2779 DEFSV_set(reginfo->sv);
b8c5462f 2780 }
d6a28714 2781
3b0527fe
DM
2782 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2783 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2784 /* prepare for quick setting of pos */
d300d9fa 2785#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2786 if (SvIsCOW(reginfo->sv))
2787 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2788#endif
3dab1dad 2789 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2790 &PL_vtbl_mglob, NULL, 0);
d6a28714 2791 mg->mg_len = -1;
b8c5462f 2792 }
d6a28714
JH
2793 PL_reg_magic = mg;
2794 PL_reg_oldpos = mg->mg_len;
4f639d21 2795 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2796 }
09687e5a 2797 if (!PL_reg_curpm) {
a02a5408 2798 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2799#ifdef USE_ITHREADS
2800 {
14a49a24 2801 SV* const repointer = &PL_sv_undef;
92313705
NC
2802 /* this regexp is also owned by the new PL_reg_curpm, which
2803 will try to free it. */
d2ece331 2804 av_push(PL_regex_padav, repointer);
09687e5a
AB
2805 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2806 PL_regex_pad = AvARRAY(PL_regex_padav);
2807 }
2808#endif
2809 }
ec43f78b 2810 SET_reg_curpm(rx);
d6a28714
JH
2811 PL_reg_oldcurpm = PL_curpm;
2812 PL_curpm = PL_reg_curpm;
07bc277f 2813 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2814 /* Here is a serious problem: we cannot rewrite subbeg,
2815 since it may be needed if this match fails. Thus
2816 $` inside (?{}) could fail... */
2817 PL_reg_oldsaved = prog->subbeg;
2818 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2819 PL_reg_oldsavedoffset = prog->suboffset;
2820 PL_reg_oldsavedcoffset = prog->suboffset;
db2c6cb3 2821#ifdef PERL_ANY_COW
ed252734
NC
2822 PL_nrs = prog->saved_copy;
2823#endif
07bc277f 2824 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2825 }
2826 else
bd61b366 2827 PL_reg_oldsaved = NULL;
d6a28714 2828 prog->subbeg = PL_bostr;
6502e081
DM
2829 prog->suboffset = 0;
2830 prog->subcoffset = 0;
d6a28714
JH
2831 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2832 }
97ca13b7 2833#ifdef DEBUGGING
f73aaa43 2834 PL_reg_starttry = *startposp;
97ca13b7 2835#endif
f73aaa43 2836 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2837 prog->lastparen = 0;
03994de8 2838 prog->lastcloseparen = 0;
d6a28714
JH
2839
2840 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2841 to do this again and again, prog->lastparen should take care of
3dd2943c 2842 this! --ilya*/
dafc8851
JH
2843
2844 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2845 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2846 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2847 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2848 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2849 * Meanwhile, this code *is* needed for the
daf18116
JH
2850 * above-mentioned test suite tests to succeed. The common theme
2851 * on those tests seems to be returning null fields from matches.
225593e1 2852 * --jhi updated by dapm */
dafc8851 2853#if 1
d6a28714 2854 if (prog->nparens) {
b93070ed 2855 regexp_paren_pair *pp = prog->offs;
eb578fdb 2856 I32 i;
b93070ed 2857 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2858 ++pp;
2859 pp->start = -1;
2860 pp->end = -1;
d6a28714
JH
2861 }
2862 }
dafc8851 2863#endif
02db2b7b 2864 REGCP_SET(lastcp);
f73aaa43
DM
2865 result = regmatch(reginfo, *startposp, progi->program + 1);
2866 if (result != -1) {
2867 prog->offs[0].end = result;
d6a28714
JH
2868 return 1;
2869 }
24b23f37 2870 if (reginfo->cutpoint)
f73aaa43 2871 *startposp= reginfo->cutpoint;
02db2b7b 2872 REGCP_UNWIND(lastcp);
d6a28714
JH
2873 return 0;
2874}
2875
02db2b7b 2876
8ba1375e
MJD
2877#define sayYES goto yes
2878#define sayNO goto no
262b90c4 2879#define sayNO_SILENT goto no_silent
8ba1375e 2880
f9f4320a
YO
2881/* we dont use STMT_START/END here because it leads to
2882 "unreachable code" warnings, which are bogus, but distracting. */
2883#define CACHEsayNO \
c476f425
DM
2884 if (ST.cache_mask) \
2885 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2886 sayNO
3298f257 2887
a3621e74 2888/* this is used to determine how far from the left messages like
265c4333
YO
2889 'failed...' are printed. It should be set such that messages
2890 are inline with the regop output that created them.
a3621e74 2891*/
265c4333 2892#define REPORT_CODE_OFF 32
a3621e74
YO
2893
2894
40a82448
DM
2895#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2896#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2897#define CHRTEST_NOT_A_CP_1 -999
2898#define CHRTEST_NOT_A_CP_2 -998
9e137952 2899
86545054
DM
2900#define SLAB_FIRST(s) (&(s)->states[0])
2901#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2902
5d9a96ca
DM
2903/* grab a new slab and return the first slot in it */
2904
2905STATIC regmatch_state *
2906S_push_slab(pTHX)
2907{
a35a87e7 2908#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2909 dMY_CXT;
2910#endif
5d9a96ca
DM
2911 regmatch_slab *s = PL_regmatch_slab->next;
2912 if (!s) {
2913 Newx(s, 1, regmatch_slab);
2914 s->prev = PL_regmatch_slab;
2915 s->next = NULL;
2916 PL_regmatch_slab->next = s;
2917 }
2918 PL_regmatch_slab = s;
86545054 2919 return SLAB_FIRST(s);
5d9a96ca 2920}
5b47454d 2921
95b24440 2922
40a82448
DM
2923/* push a new state then goto it */
2924
4d5016e5
DM
2925#define PUSH_STATE_GOTO(state, node, input) \
2926 pushinput = input; \
40a82448
DM
2927 scan = node; \
2928 st->resume_state = state; \
2929 goto push_state;
2930
2931/* push a new state with success backtracking, then goto it */
2932
4d5016e5
DM
2933#define PUSH_YES_STATE_GOTO(state, node, input) \
2934 pushinput = input; \
40a82448
DM
2935 scan = node; \
2936 st->resume_state = state; \
2937 goto push_yes_state;
2938
aa283a38 2939
aa283a38 2940
4d5016e5 2941
d6a28714 2942/*
95b24440 2943
bf1f174e
DM
2944regmatch() - main matching routine
2945
2946This is basically one big switch statement in a loop. We execute an op,
2947set 'next' to point the next op, and continue. If we come to a point which
2948we may need to backtrack to on failure such as (A|B|C), we push a
2949backtrack state onto the backtrack stack. On failure, we pop the top
2950state, and re-enter the loop at the state indicated. If there are no more
2951states to pop, we return failure.
2952
2953Sometimes we also need to backtrack on success; for example /A+/, where
2954after successfully matching one A, we need to go back and try to
2955match another one; similarly for lookahead assertions: if the assertion
2956completes successfully, we backtrack to the state just before the assertion
2957and then carry on. In these cases, the pushed state is marked as
2958'backtrack on success too'. This marking is in fact done by a chain of
2959pointers, each pointing to the previous 'yes' state. On success, we pop to
2960the nearest yes state, discarding any intermediate failure-only states.
2961Sometimes a yes state is pushed just to force some cleanup code to be
2962called at the end of a successful match or submatch; e.g. (??{$re}) uses
2963it to free the inner regex.
2964
2965Note that failure backtracking rewinds the cursor position, while
2966success backtracking leaves it alone.
2967
2968A pattern is complete when the END op is executed, while a subpattern
2969such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2970ops trigger the "pop to last yes state if any, otherwise return true"
2971behaviour.
2972
2973A common convention in this function is to use A and B to refer to the two
2974subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2975the subpattern to be matched possibly multiple times, while B is the entire
2976rest of the pattern. Variable and state names reflect this convention.
2977
2978The states in the main switch are the union of ops and failure/success of
2979substates associated with with that op. For example, IFMATCH is the op
2980that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2981'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2982successfully matched A and IFMATCH_A_fail is a state saying that we have
2983just failed to match A. Resume states always come in pairs. The backtrack
2984state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2985at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2986on success or failure.
2987
2988The struct that holds a backtracking state is actually a big union, with
2989one variant for each major type of op. The variable st points to the
2990top-most backtrack struct. To make the code clearer, within each
2991block of code we #define ST to alias the relevant union.
2992
2993Here's a concrete example of a (vastly oversimplified) IFMATCH
2994implementation:
2995
2996 switch (state) {
2997 ....
2998
2999#define ST st->u.ifmatch
3000
3001 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3002 ST.foo = ...; // some state we wish to save
95b24440 3003 ...
bf1f174e
DM
3004 // push a yes backtrack state with a resume value of
3005 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3006 // first node of A:
4d5016e5 3007 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3008 // NOTREACHED
3009
3010 case IFMATCH_A: // we have successfully executed A; now continue with B
3011 next = B;
3012 bar = ST.foo; // do something with the preserved value
3013 break;
3014
3015 case IFMATCH_A_fail: // A failed, so the assertion failed
3016 ...; // do some housekeeping, then ...
3017 sayNO; // propagate the failure
3018
3019#undef ST
95b24440 3020
bf1f174e
DM
3021 ...
3022 }
95b24440 3023
bf1f174e
DM
3024For any old-timers reading this who are familiar with the old recursive
3025approach, the code above is equivalent to:
95b24440 3026
bf1f174e
DM
3027 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3028 {
3029 int foo = ...
95b24440 3030 ...
bf1f174e
DM
3031 if (regmatch(A)) {
3032 next = B;
3033 bar = foo;
3034 break;
95b24440 3035 }
bf1f174e
DM
3036 ...; // do some housekeeping, then ...
3037 sayNO; // propagate the failure
95b24440 3038 }
bf1f174e
DM
3039
3040The topmost backtrack state, pointed to by st, is usually free. If you
3041want to claim it, populate any ST.foo fields in it with values you wish to
3042save, then do one of
3043
4d5016e5
DM
3044 PUSH_STATE_GOTO(resume_state, node, newinput);
3045 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3046
3047which sets that backtrack state's resume value to 'resume_state', pushes a
3048new free entry to the top of the backtrack stack, then goes to 'node'.
3049On backtracking, the free slot is popped, and the saved state becomes the
3050new free state. An ST.foo field in this new top state can be temporarily
3051accessed to retrieve values, but once the main loop is re-entered, it
3052becomes available for reuse.
3053
3054Note that the depth of the backtrack stack constantly increases during the
3055left-to-right execution of the pattern, rather than going up and down with
3056the pattern nesting. For example the stack is at its maximum at Z at the
3057end of the pattern, rather than at X in the following:
3058
3059 /(((X)+)+)+....(Y)+....Z/
3060
3061The only exceptions to this are lookahead/behind assertions and the cut,
3062(?>A), which pop all the backtrack states associated with A before
3063continuing.
3064
486ec47a 3065Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3066PL_regmatch_state and st always point to the currently active state,
3067and PL_regmatch_slab points to the slab currently containing
3068PL_regmatch_state. The first time regmatch() is called, the first slab is
3069allocated, and is never freed until interpreter destruction. When the slab
3070is full, a new one is allocated and chained to the end. At exit from
3071regmatch(), slabs allocated since entry are freed.
3072
3073*/
95b24440 3074
40a82448 3075
5bc10b2c 3076#define DEBUG_STATE_pp(pp) \
265c4333 3077 DEBUG_STATE_r({ \
f2ed9b32 3078 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3079 PerlIO_printf(Perl_debug_log, \
5d458dd8 3080 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3081 depth*2, "", \
13d6edb4 3082 PL_reg_name[st->resume_state], \
5d458dd8
YO
3083 ((st==yes_state||st==mark_state) ? "[" : ""), \
3084 ((st==yes_state) ? "Y" : ""), \
3085 ((st==mark_state) ? "M" : ""), \
3086 ((st==yes_state||st==mark_state) ? "]" : "") \
3087 ); \
265c4333 3088 });
5bc10b2c 3089
40a82448 3090
3dab1dad 3091#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3092
3df15adc 3093#ifdef DEBUGGING
5bc10b2c 3094
ab3bbdeb 3095STATIC void
f2ed9b32 3096S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3097 const char *start, const char *end, const char *blurb)
3098{
efd26800 3099 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3100
3101 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3102
ab3bbdeb
YO
3103 if (!PL_colorset)
3104 reginitcolors();
3105 {
3106 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3107 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3108
f2ed9b32 3109 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3110 start, end - start, 60);
3111
3112 PerlIO_printf(Perl_debug_log,
3113 "%s%s REx%s %s against %s\n",
3114 PL_colors[4], blurb, PL_colors[5], s0, s1);
3115
f2ed9b32 3116 if (utf8_target||utf8_pat)
1de06328
YO
3117 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3118 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3119 utf8_pat && utf8_target ? " and " : "",
3120 utf8_target ? "string" : ""
ab3bbdeb
YO
3121 );
3122 }
3123}
3df15adc
YO
3124
3125STATIC void
786e8c11
YO
3126S_dump_exec_pos(pTHX_ const char *locinput,
3127 const regnode *scan,
3128 const char *loc_regeol,
3129 const char *loc_bostr,
3130 const char *loc_reg_starttry,
f2ed9b32 3131 const bool utf8_target)
07be1b83 3132{
786e8c11 3133 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3134 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3135 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3136 /* The part of the string before starttry has one color
3137 (pref0_len chars), between starttry and current
3138 position another one (pref_len - pref0_len chars),
3139 after the current position the third one.
3140 We assume that pref0_len <= pref_len, otherwise we
3141 decrease pref0_len. */
786e8c11
YO
3142 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3143 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3144 int pref0_len;
3145
7918f24d
NC
3146 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3147
f2ed9b32 3148 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3149 pref_len++;
786e8c11
YO
3150 pref0_len = pref_len - (locinput - loc_reg_starttry);
3151 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3152 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3153 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3154 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3155 l--;
3156 if (pref0_len < 0)
3157 pref0_len = 0;
3158 if (pref0_len > pref_len)
3159 pref0_len = pref_len;
3160 {
f2ed9b32 3161 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3162
ab3bbdeb 3163 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3164 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3165
ab3bbdeb 3166 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3167 (locinput - pref_len + pref0_len),
1de06328 3168 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3169
ab3bbdeb 3170 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3171 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3172
1de06328 3173 const STRLEN tlen=len0+len1+len2;
3df15adc 3174 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3175 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3176 (IV)(locinput - loc_bostr),
07be1b83 3177 len0, s0,
07be1b83 3178 len1, s1,
07be1b83 3179 (docolor ? "" : "> <"),
07be1b83 3180 len2, s2,
f9f4320a 3181 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3182 "");
3183 }
3184}
3df15adc 3185
07be1b83
YO
3186#endif
3187
0a4db386
YO
3188/* reg_check_named_buff_matched()
3189 * Checks to see if a named buffer has matched. The data array of
3190 * buffer numbers corresponding to the buffer is expected to reside
3191 * in the regexp->data->data array in the slot stored in the ARG() of
3192 * node involved. Note that this routine doesn't actually care about the
3193 * name, that information is not preserved from compilation to execution.
3194 * Returns the index of the leftmost defined buffer with the given name
3195 * or 0 if non of the buffers matched.
3196 */
3197STATIC I32
7918f24d
NC
3198S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3199{
0a4db386 3200 I32 n;
f8fc2ecf 3201 RXi_GET_DECL(rex,rexi);
ad64d0ec 3202 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3203 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3204
3205 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3206
0a4db386 3207 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3208 if ((I32)rex->lastparen >= nums[n] &&
3209 rex->offs[nums[n]].end != -1)
0a4db386
YO
3210 {
3211 return nums[n];
3212 }
3213 }
3214 return 0;
3215}
3216
2f554ef7
DM
3217
3218/* free all slabs above current one - called during LEAVE_SCOPE */
3219
3220STATIC void
3221S_clear_backtrack_stack(pTHX_ void *p)
3222{
3223 regmatch_slab *s = PL_regmatch_slab->next;
3224 PERL_UNUSED_ARG(p);
3225
3226 if (!s)
3227 return;
3228 PL_regmatch_slab->next = NULL;
3229 while (s) {
3230 regmatch_slab * const osl = s;
3231 s = s->next;
3232 Safefree(osl);
3233 }
3234}
c74f6de9 3235static bool
984e6dd1
DM
3236S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3237 U8* c1_utf8, int *c2p, U8* c2_utf8, bool is_utf8_pat)
c74f6de9 3238{
79a2a0e8
KW
3239 /* This function determines if there are one or two characters that match
3240 * the first character of the passed-in EXACTish node <text_node>, and if
3241 * so, returns them in the passed-in pointers.
c74f6de9 3242 *
79a2a0e8
KW
3243 * If it determines that no possible character in the target string can
3244 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3245 * the first character in <text_node> requires UTF-8 to represent, and the
3246 * target string isn't in UTF-8.)
c74f6de9 3247 *
79a2a0e8
KW
3248 * If there are more than two characters that could match the beginning of
3249 * <text_node>, or if more context is required to determine a match or not,
3250 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3251 *
3252 * The motiviation behind this function is to allow the caller to set up
3253 * tight loops for matching. If <text_node> is of type EXACT, there is
3254 * only one possible character that can match its first character, and so
3255 * the situation is quite simple. But things get much more complicated if
3256 * folding is involved. It may be that the first character of an EXACTFish
3257 * node doesn't participate in any possible fold, e.g., punctuation, so it
3258 * can be matched only by itself. The vast majority of characters that are
3259 * in folds match just two things, their lower and upper-case equivalents.
3260 * But not all are like that; some have multiple possible matches, or match
3261 * sequences of more than one character. This function sorts all that out.
3262 *
3263 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3264 * loop of trying to match A*, we know we can't exit where the thing
3265 * following it isn't a B. And something can't be a B unless it is the
3266 * beginning of B. By putting a quick test for that beginning in a tight
3267 * loop, we can rule out things that can't possibly be B without having to
3268 * break out of the loop, thus avoiding work. Similarly, if A is a single
3269 * character, we can make a tight loop matching A*, using the outputs of
3270 * this function.
3271 *
3272 * If the target string to match isn't in UTF-8, and there aren't
3273 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3274 * the one or two possible octets (which are characters in this situation)
3275 * that can match. In all cases, if there is only one character that can
3276 * match, *<c1p> and *<c2p> will be identical.
3277 *
3278 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3279 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3280 * can match the beginning of <text_node>. They should be declared with at
3281 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3282 * undefined what these contain.) If one or both of the buffers are
3283 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3284 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3285 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3286 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3287 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3288
3289 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8 3290
ddb0d839
KW
3291 UV c1 = CHRTEST_NOT_A_CP_1;
3292 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8
KW
3293 bool use_chrtest_void = FALSE;
3294
3295 /* Used when we have both utf8 input and utf8 output, to avoid converting
3296 * to/from code points */
3297 bool utf8_has_been_setup = FALSE;
3298
c74f6de9
KW
3299 dVAR;
3300
b4291290 3301 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3302
79a2a0e8
KW
3303 if (OP(text_node) == EXACT) {
3304
3305 /* In an exact node, only one thing can be matched, that first
3306 * character. If both the pat and the target are UTF-8, we can just
3307 * copy the input to the output, avoiding finding the code point of
3308 * that character */
984e6dd1 3309 if (!is_utf8_pat) {
79a2a0e8
KW
3310 c2 = c1 = *pat;
3311 }
3312 else if (utf8_target) {
3313 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3314 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3315 utf8_has_been_setup = TRUE;
3316 }
3317 else {
3318 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3319 }
79a2a0e8
KW
3320 }
3321 else /* an EXACTFish node */
984e6dd1 3322 if ((is_utf8_pat
79a2a0e8
KW
3323 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3324 pat + STR_LEN(text_node)))
984e6dd1 3325 || (!is_utf8_pat
79a2a0e8
KW
3326 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3327 pat + STR_LEN(text_node))))
3328 {
3329 /* Multi-character folds require more context to sort out. Also
3330 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3331 * handled outside this routine */
3332 use_chrtest_void = TRUE;
3333 }
3334 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
984e6dd1 3335 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
79a2a0e8
KW
3336 if (c1 > 256) {
3337 /* Load the folds hash, if not already done */
3338 SV** listp;
3339 if (! PL_utf8_foldclosures) {
3340 if (! PL_utf8_tofold) {
3341 U8 dummy[UTF8_MAXBYTES+1];
3342
3343 /* Force loading this by folding an above-Latin1 char */
3344 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3345 assert(PL_utf8_tofold); /* Verify that worked */
3346 }
3347 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3348 }
3349
3350 /* The fold closures data structure is a hash with the keys being
3351 * the UTF-8 of every character that is folded to, like 'k', and
3352 * the values each an array of all code points that fold to its
3353 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3354 * not included */
3355 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3356 (char *) pat,
3357 UTF8SKIP(pat),
3358 FALSE))))
3359 {
3360 /* Not found in the hash, therefore there are no folds
3361 * containing it, so there is only a single character that
3362 * could match */
3363 c2 = c1;
3364 }
3365 else { /* Does participate in folds */
3366 AV* list = (AV*) *listp;
3367 if (av_len(list) != 1) {
3368
3369 /* If there aren't exactly two folds to this, it is outside
3370 * the scope of this function */
3371 use_chrtest_void = TRUE;
3372 }
3373 else { /* There are two. Get them */
3374 SV** c_p = av_fetch(list, 0, FALSE);
3375 if (c_p == NULL) {
3376 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3377 }
3378 c1 = SvUV(*c_p);
3379
3380 c_p = av_fetch(list, 1, FALSE);
3381 if (c_p == NULL) {
3382 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3383 }
3384 c2 = SvUV(*c_p);
3385
3386 /* Folds that cross the 255/256 boundary are forbidden if
3387 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3388 * pattern character is above 256, and its only other match
3389 * is below 256, the only legal match will be to itself.
3390 * We have thrown away the original, so have to compute
3391 * which is the one above 255 */
3392 if ((c1 < 256) != (c2 < 256)) {
3393 if (OP(text_node) == EXACTFL
3394 || (OP(text_node) == EXACTFA
3395 && (isASCII(c1) || isASCII(c2))))
3396 {
3397 if (c1 < 256) {
3398 c1 = c2;
3399 }
3400 else {
3401 c2 = c1;
3402 }
3403 }
3404 }
3405 }
3406 }
3407 }
3408 else /* Here, c1 is < 255 */
3409 if (utf8_target
3410 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3411 && OP(text_node) != EXACTFL
3412 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3413 {
3414 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3415 * folds to this character in the pattern. All such cases except
3416 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3417 * involved in their folds, so are outside the scope of this
3418 * function */
3419 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3420 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3421 }
3422 else {
3423 use_chrtest_void = TRUE;
3424 }
c74f6de9
KW
3425 }
3426 else { /* Here nothing above Latin1 can fold to the pattern character */
3427 switch (OP(text_node)) {
3428
3429 case EXACTFL: /* /l rules */
79a2a0e8 3430 c2 = PL_fold_locale[c1];
c74f6de9
KW
3431 break;
3432
3433 case EXACTF:
3434 if (! utf8_target) { /* /d rules */
79a2a0e8 3435 c2 = PL_fold[c1];
c74f6de9
KW
3436 break;
3437 }
3438 /* FALLTHROUGH */
3439 /* /u rules for all these. This happens to work for
79a2a0e8 3440 * EXACTFA as nothing in Latin1 folds to ASCII */
c74f6de9
KW
3441 case EXACTFA:
3442 case EXACTFU_TRICKYFOLD:
79a2a0e8 3443 case EXACTFU_SS:
c74f6de9 3444 case EXACTFU:
79a2a0e8 3445 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3446 break;
3447
878531d3
KW
3448 default:
3449 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3450 assert(0); /* NOTREACHED */
c74f6de9
KW
3451 }
3452 }
3453 }
79a2a0e8
KW
3454
3455 /* Here have figured things out. Set up the returns */
3456 if (use_chrtest_void) {
3457 *c2p = *c1p = CHRTEST_VOID;
3458 }
3459 else if (utf8_target) {
3460 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3461 uvchr_to_utf8(c1_utf8, c1);
3462 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3463 }
c74f6de9 3464
79a2a0e8
KW
3465 /* Invariants are stored in both the utf8 and byte outputs; Use
3466 * negative numbers otherwise for the byte ones. Make sure that the
3467 * byte ones are the same iff the utf8 ones are the same */
3468 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3469 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3470 ? *c2_utf8
3471 : (c1 == c2)
3472 ? CHRTEST_NOT_A_CP_1
3473 : CHRTEST_NOT_A_CP_2;
3474 }
3475 else if (c1 > 255) {
3476 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3477 can represent */
3478 return FALSE;
3479 }
c74f6de9 3480
79a2a0e8
KW
3481 *c1p = *c2p = c2; /* c2 is the only representable value */
3482 }
3483 else { /* c1 is representable; see about c2 */
3484 *c1p = c1;
3485 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3486 }
2f554ef7 3487
c74f6de9
KW
3488 return TRUE;
3489}
2f554ef7 3490
f73aaa43
DM
3491/* returns -1 on failure, $+[0] on success */
3492STATIC I32
3493S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3494{
a35a87e7 3495#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3496 dMY_CXT;
3497#endif
27da23d5 3498 dVAR;
eb578fdb 3499 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3500 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3501 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3502 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3503 RXi_GET_DECL(rex,rexi);
2f554ef7 3504 I32 oldsave;
5d9a96ca 3505 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3506 regmatch_state *st;
5d9a96ca 3507 /* cache heavy used fields of st in registers */
eb578fdb
KW
3508 regnode *scan;
3509 regnode *next;
3510 U32 n = 0; /* general value; init to avoid compiler warning */
3511 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3512 char *locinput = startpos;
4d5016e5 3513 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3514 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3515
b69b0499 3516 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3517 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3518 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3519 const U32 max_nochange_depth =
3520 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3521 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3522 regmatch_state *yes_state = NULL; /* state to pop to on success of
3523 subpattern */
e2e6a0f1
YO
3524 /* mark_state piggy backs on the yes_state logic so that when we unwind
3525 the stack on success we can update the mark_state as we go */
3526 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3527 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3528 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3529 U32 state_num;
5d458dd8
YO
3530 bool no_final = 0; /* prevent failure from backtracking? */
3531 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3532 char *startpoint = locinput;
5d458dd8
YO
3533 SV *popmark = NULL; /* are we looking for a mark? */
3534 SV *sv_commit = NULL; /* last mark name seen in failure */
3535 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3536 during a successful match */
5d458dd8
YO
3537 U32 lastopen = 0; /* last open we saw */
3538 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3539 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3540 /* these three flags are set by various ops to signal information to
3541 * the very next op. They have a useful lifetime of exactly one loop
3542 * iteration, and are not preserved or restored by state pushes/pops
3543 */
3544 bool sw = 0; /* the condition value in (?(cond)a|b) */
3545 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3546 int logical = 0; /* the following EVAL is:
3547 0: (?{...})
3548 1: (?(?{...})X|Y)
3549 2: (??{...})
3550 or the following IFMATCH/UNLESSM is:
3551 false: plain (?=foo)
3552 true: used as a condition: (?(?=foo))
3553 */
81ed78b2
DM
3554 PAD* last_pad = NULL;
3555 dMULTICALL;
3556 I32 gimme = G_SCALAR;
3557 CV *caller_cv = NULL; /* who called us */
3558 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3559 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
92da3157 3560 U32 maxopenparen = 0; /* max '(' index seen so far */
3018b823
KW
3561 int to_complement; /* Invert the result? */
3562 _char_class_number classnum;
984e6dd1 3563 bool is_utf8_pat = reginfo->is_utf8_pat;
81ed78b2 3564
95b24440 3565#ifdef DEBUGGING
e68ec53f 3566 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3567#endif
3568
81ed78b2
DM
3569 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3570 multicall_oldcatch = 0;
3571 multicall_cv = NULL;
3572 cx = NULL;
4f8dbb2d
JL
3573 PERL_UNUSED_VAR(multicall_cop);
3574 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3575
3576
7918f24d
NC
3577 PERL_ARGS_ASSERT_REGMATCH;
3578
3b57cd43 3579 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3580 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3581 }));
5d9a96ca
DM
3582 /* on first ever call to regmatch, allocate first slab */
3583 if (!PL_regmatch_slab) {
3584 Newx(PL_regmatch_slab, 1, regmatch_slab);
3585 PL_regmatch_slab->prev = NULL;
3586 PL_regmatch_slab->next = NULL;
86545054 3587 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3588 }
3589
2f554ef7
DM
3590 oldsave = PL_savestack_ix;
3591 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3592 SAVEVPTR(PL_regmatch_slab);
3593 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3594
3595 /* grab next free state slot */
3596 st = ++PL_regmatch_state;
86545054 3597 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3598 st = PL_regmatch_state = S_push_slab(aTHX);
3599
d6a28714 3600 /* Note that nextchr is a byte even in UTF */
7016d6eb 3601 SET_nextchr;
d6a28714
JH
3602 scan = prog;
3603 while (scan != NULL) {
8ba1375e 3604
a3621e74 3605 DEBUG_EXECUTE_r( {
6136c704 3606 SV * const prop = sv_newmortal();
1de06328 3607 regnode *rnext=regnext(scan);
f2ed9b32 3608 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3609 regprop(rex, prop, scan);
07be1b83
YO
3610
3611 PerlIO_printf(Perl_debug_log,
3612 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3613 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3614 SvPVX_const(prop),
1de06328 3615 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3616 0 : (IV)(rnext - rexi->program));
2a782b5b 3617 });
d6a28714
JH
3618
3619 next = scan + NEXT_OFF(scan);
3620 if (next == scan)
3621 next = NULL;
40a82448 3622 state_num = OP(scan);
d6a28714 3623
40a82448 3624 reenter_switch:
3018b823 3625 to_complement = 0;
34a81e2b 3626
7016d6eb 3627 SET_nextchr;
e6ca698c 3628 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 3629
40a82448 3630 switch (state_num) {
3c0563b9 3631 case BOL: /* /^../ */
7fba1cd6 3632 if (locinput == PL_bostr)
d6a28714 3633 {
3b0527fe 3634 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3635 break;
3636 }
d6a28714 3637 sayNO;
3c0563b9
DM
3638
3639 case MBOL: /* /^../m */
12d33761 3640 if (locinput == PL_bostr ||
7016d6eb 3641 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 3642 {
b8c5462f
JH
3643 break;
3644 }
d6a28714 3645 sayNO;
3c0563b9
DM
3646
3647 case SBOL: /* /^../s */
c2a73568 3648 if (locinput == PL_bostr)
b8c5462f 3649 break;
d6a28714 3650 sayNO;
3c0563b9
DM
3651
3652 case GPOS: /* \G */
3b0527fe 3653 if (locinput == reginfo->ganch)
d6a28714
JH
3654 break;
3655 sayNO;
ee9b8eae 3656
3c0563b9 3657 case KEEPS: /* \K */
ee9b8eae 3658 /* update the startpoint */
b93070ed 3659 st->u.keeper.val = rex->offs[0].start;
b93070ed 3660 rex->offs[0].start = locinput - PL_bostr;
4d5016e5 3661 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
878531d3 3662 assert(0); /*NOTREACHED*/
ee9b8eae
YO
3663 case KEEPS_next_fail:
3664 /* rollback the start point change */
b93070ed 3665 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 3666 sayNO_SILENT;
878531d3 3667 assert(0); /*NOTREACHED*/
3c0563b9
DM
3668
3669 case EOL: /* /..$/ */
d6a28714 3670 goto seol;
3c0563b9
DM
3671
3672 case MEOL: /* /..$/m */
7016d6eb 3673 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3674 sayNO;
b8c5462f 3675 break;
3c0563b9
DM
3676
3677 case SEOL: /* /..$/s */
d6a28714 3678 seol:
7016d6eb 3679 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3680 sayNO;
d6a28714 3681 if (PL_regeol - locinput > 1)
b8c5462f 3682 sayNO;
b8c5462f 3683 break;
3c0563b9
DM
3684
3685 case EOS: /* \z */
7016d6eb 3686 if (!NEXTCHR_IS_EOS)
b8c5462f 3687 sayNO;
d6a28714 3688 break;
3c0563b9
DM
3689
3690 case SANY: /* /./s */
7016d6eb 3691 if (NEXTCHR_IS_EOS)
4633a7c4 3692 sayNO;
28b98f76 3693 goto increment_locinput;
3c0563b9
DM
3694
3695 case CANY: /* \C */
7016d6eb 3696 if (NEXTCHR_IS_EOS)
f33976b4 3697 sayNO;
3640db6b 3698 locinput++;
a0d0e21e 3699 break;
3c0563b9
DM
3700
3701 case REG_ANY: /* /./ */
7016d6eb 3702 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 3703 sayNO;
28b98f76
DM
3704 goto increment_locinput;
3705
166ba7cd
DM
3706
3707#undef ST
3708#define ST st->u.trie
3c0563b9 3709 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
3710 /* In this case the charclass data is available inline so
3711 we can fail fast without a lot of extra overhead.
3712 */
7016d6eb 3713 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
3714 DEBUG_EXECUTE_r(
3715 PerlIO_printf(Perl_debug_log,
3716 "%*s %sfailed to match trie start class...%s\n",
3717 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3718 );
3719 sayNO_SILENT;
118e2215 3720 assert(0); /* NOTREACHED */
786e8c11
YO
3721 }
3722 /* FALL THROUGH */
3c0563b9 3723 case TRIE: /* (ab|cd) */
2e64971a
DM
3724 /* the basic plan of execution of the trie is:
3725 * At the beginning, run though all the states, and
3726 * find the longest-matching word. Also remember the position
3727 * of the shortest matching word. For example, this pattern:
3728 * 1 2 3 4 5
3729 * ab|a|x|abcd|abc
3730 * when matched against the string "abcde", will generate
3731 * accept states for all words except 3, with the longest
895cc420 3732 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
3733 * the position being after char 1 of the string).
3734 *
3735 * Then for each matching word, in word order (i.e. 1,2,4,5),
3736 * we run the remainder of the pattern; on each try setting
3737 * the current position to the character following the word,
3738 * returning to try the next word on failure.
3739 *
3740 * We avoid having to build a list of words at runtime by
3741 * using a compile-time structure, wordinfo[].prev, which
3742 * gives, for each word, the previous accepting word (if any).
3743 * In the case above it would contain the mappings 1->2, 2->0,
3744 * 3->0, 4->5, 5->1. We can use this table to generate, from
3745 * the longest word (4 above), a list of all words, by
3746 * following the list of prev pointers; this gives us the
3747 * unordered list 4,5,1,2. Then given the current word we have
3748 * just tried, we can go through the list and find the
3749 * next-biggest word to try (so if we just failed on word 2,
3750 * the next in the list is 4).
3751 *
3752 * Since at runtime we don't record the matching position in
3753 * the string for each word, we have to work that out for
3754 * each word we're about to process. The wordinfo table holds
3755 * the character length of each word; given that we recorded
3756 * at the start: the position of the shortest word and its
3757 * length in chars, we just need to move the pointer the
3758 * difference between the two char lengths. Depending on
3759 * Unicode status and folding, that's cheap or expensive.
3760 *
3761 * This algorithm is optimised for the case where are only a
3762 * small number of accept states, i.e. 0,1, or maybe 2.
3763 * With lots of accepts states, and having to try all of them,
3764 * it becomes quadratic on number of accept states to find all
3765 * the next words.
3766 */
3767
3dab1dad 3768 {
07be1b83 3769 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3770 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3771
3772 /* what trie are we using right now */
be8e71aa 3773 reg_trie_data * const trie
f8fc2ecf 3774 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3775 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3776 U32 state = trie->startstate;
166ba7cd 3777
7016d6eb
DM
3778 if ( trie->bitmap
3779 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3780 {
3dab1dad
YO
3781 if (trie->states[ state ].wordnum) {
3782 DEBUG_EXECUTE_r(
3783 PerlIO_printf(Perl_debug_log,
3784 "%*s %smatched empty string...%s\n",
5bc10b2c 3785 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3786 );
20dbff7c
YO
3787 if (!trie->jump)
3788 break;
3dab1dad
YO
3789 } else {
3790 DEBUG_EXECUTE_r(
3791 PerlIO_printf(Perl_debug_log,
786e8c11 3792 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3793 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3794 );
3795 sayNO_SILENT;
3796 }
3797 }
166ba7cd 3798
786e8c11
YO
3799 {
3800 U8 *uc = ( U8* )locinput;
3801
3802 STRLEN len = 0;
3803 STRLEN foldlen = 0;
3804 U8 *uscan = (U8*)NULL;
786e8c11 3805 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3806 U32 charcount = 0; /* how many input chars we have matched */
3807 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3808
786e8c11 3809 ST.jump = trie->jump;
786e8c11 3810 ST.me = scan;
2e64971a
DM
3811 ST.firstpos = NULL;
3812 ST.longfold = FALSE; /* char longer if folded => it's harder */
3813 ST.nextword = 0;
3814
3815 /* fully traverse the TRIE; note the position of the
3816 shortest accept state and the wordnum of the longest
3817 accept state */
07be1b83 3818
a3621e74 3819 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3820 U32 base = trie->states[ state ].trans.base;
f9f4320a 3821 UV uvc = 0;
acb909b4 3822 U16 charid = 0;
2e64971a
DM
3823 U16 wordnum;
3824 wordnum = trie->states[ state ].wordnum;
3825
3826 if (wordnum) { /* it's an accept state */
3827 if (!accepted) {
3828 accepted = 1;
3829 /* record first match position */
3830 if (ST.longfold) {
3831 ST.firstpos = (U8*)locinput;
3832 ST.firstchars = 0;
5b47454d 3833 }
2e64971a
DM
3834 else {
3835 ST.firstpos = uc;
3836 ST.firstchars = charcount;
3837 }
3838 }
3839 if (!ST.nextword || wordnum < ST.nextword)
3840 ST.nextword = wordnum;
3841 ST.topword = wordnum;
786e8c11 3842 }
a3621e74 3843
07be1b83 3844 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3845 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3846 PerlIO_printf( Perl_debug_log,
2e64971a 3847 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3848 2+depth * 2, "", PL_colors[4],
2e64971a 3849 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3850 });
a3621e74 3851
2e64971a 3852 /* read a char and goto next state */
7016d6eb 3853 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
6dd2be57 3854 I32 offset;
55eed653
NC
3855 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3856 uscan, len, uvc, charid, foldlen,
3857 foldbuf, uniflags);
2e64971a
DM
3858 charcount++;
3859 if (foldlen>0)
3860 ST.longfold = TRUE;
5b47454d 3861 if (charid &&
6dd2be57
DM
3862 ( ((offset =
3863 base + charid - 1 - trie->uniquecharcount)) >= 0)
3864
3865 && ((U32)offset < trie->lasttrans)
3866 && trie->trans[offset].check == state)
5b47454d 3867 {
6dd2be57 3868 state = trie->trans[offset].next;
5b47454d
DM
3869 }
3870 else {
3871 state = 0;
3872 }
3873 uc += len;
3874
3875 }
3876 else {
a3621e74
YO
3877 state = 0;
3878 }
3879 DEBUG_TRIE_EXECUTE_r(
e4584336 3880 PerlIO_printf( Perl_debug_log,
786e8c11 3881 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3882 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3883 );
3884 }
2e64971a 3885 if (!accepted)
a3621e74 3886 sayNO;
a3621e74 3887
2e64971a
DM
3888 /* calculate total number of accept states */
3889 {
3890 U16 w = ST.topword;
3891 accepted = 0;
3892 while (w) {
3893 w = trie->wordinfo[w].prev;
3894 accepted++;
3895 }
3896 ST.accepted = accepted;
3897 }
3898
166ba7cd
DM
3899 DEBUG_EXECUTE_r(
3900 PerlIO_printf( Perl_debug_log,
3901 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3902 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3903 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3904 );
2e64971a 3905 goto trie_first_try; /* jump into the fail handler */
786e8c11 3906 }}
118e2215 3907 assert(0); /* NOTREACHED */
2e64971a
DM
3908
3909 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
3910 {
3911 U8 *uc;
fae667d5
YO
3912 if ( ST.jump) {
3913 REGCP_UNWIND(ST.cp);
a8d1f4b4 3914 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3915 }
2e64971a
DM
3916 if (!--ST.accepted) {
3917 DEBUG_EXECUTE_r({
3918 PerlIO_printf( Perl_debug_log,
3919 "%*s %sTRIE failed...%s\n",
3920 REPORT_CODE_OFF+depth*2, "",
3921 PL_colors[4],
3922 PL_colors[5] );
3923 });
3924 sayNO_SILENT;
3925 }
3926 {
3927 /* Find next-highest word to process. Note that this code
3928 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
3929 U16 min = 0;
3930 U16 word;
3931 U16 const nextword = ST.nextword;
3932 reg_trie_wordinfo * const wordinfo
2e64971a
DM
3933 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3934 for (word=ST.topword; word; word=wordinfo[word].prev) {
3935 if (word > nextword && (!min || word < min))
3936 min = word;
3937 }
3938 ST.nextword = min;
3939 }
3940
fae667d5 3941 trie_first_try:
5d458dd8
YO
3942 if (do_cutgroup) {
3943 do_cutgroup = 0;
3944 no_final = 0;
3945 }
fae667d5
YO
3946
3947 if ( ST.jump) {
b93070ed 3948 ST.lastparen = rex->lastparen;
f6033a9d 3949 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 3950 REGCP_SET(ST.cp);
2e64971a 3951 }
a3621e74 3952
2e64971a 3953 /* find start char of end of current word */
166ba7cd 3954 {
2e64971a 3955 U32 chars; /* how many chars to skip */
2e64971a
DM
3956 reg_trie_data * const trie
3957 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3958
3959 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3960 >= ST.firstchars);
3961 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3962 - ST.firstchars;
a059a757 3963 uc = ST.firstpos;
2e64971a
DM
3964
3965 if (ST.longfold) {
3966 /* the hard option - fold each char in turn and find
3967 * its folded length (which may be different */
3968 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3969 STRLEN foldlen;
3970 STRLEN len;
d9a396a3 3971 UV uvc;
2e64971a
DM
3972 U8 *uscan;
3973
3974 while (chars) {
f2ed9b32 3975 if (utf8_target) {
2e64971a
DM
3976 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3977 uniflags);
3978 uc += len;
3979 }
3980 else {
3981 uvc = *uc;
3982 uc++;
3983 }
3984 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3985 uscan = foldbuf;
3986 while (foldlen) {
3987 if (!--chars)
3988 break;
3989 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3990 uniflags);
3991 uscan += len;
3992 foldlen -= len;
3993 }
3994 }
a3621e74 3995 }
2e64971a 3996 else {
f2ed9b32 3997 if (utf8_target)
2e64971a
DM
3998 while (chars--)
3999 uc += UTF8SKIP(uc);
4000 else
4001 uc += chars;
4002 }
2e64971a 4003 }
166ba7cd 4004
6603fe3e
DM
4005 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4006 ? ST.jump[ST.nextword]
4007 : NEXT_OFF(ST.me));
166ba7cd 4008
2e64971a
DM
4009 DEBUG_EXECUTE_r({
4010 PerlIO_printf( Perl_debug_log,
4011 "%*s %sTRIE matched word #%d, continuing%s\n",
4012 REPORT_CODE_OFF+depth*2, "",
4013 PL_colors[4],
4014 ST.nextword,
4015 PL_colors[5]
4016 );
4017 });
4018
4019 if (ST.accepted > 1 || has_cutgroup) {
a059a757 4020 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
118e2215 4021 assert(0); /* NOTREACHED */
166ba7cd 4022 }
2e64971a
DM
4023 /* only one choice left - just continue */
4024 DEBUG_EXECUTE_r({
4025 AV *const trie_words
4026 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4027 SV ** const tmp = av_fetch( trie_words,
4028 ST.nextword-1, 0 );
4029 SV *sv= tmp ? sv_newmortal() : NULL;
4030
4031 PerlIO_printf( Perl_debug_log,
4032 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4033 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4034 ST.nextword,
4035 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4036 PL_colors[0], PL_colors[1],
c89df6cf 4037 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
4038 )
4039 : "not compiled under -Dr",
4040 PL_colors[5] );
4041 });
4042
a059a757 4043 locinput = (char*)uc;
2e64971a 4044 continue; /* execute rest of RE */
118e2215 4045 assert(0); /* NOTREACHED */
a059a757 4046 }
166ba7cd
DM
4047#undef ST
4048
3c0563b9 4049 case EXACT: { /* /abc/ */
95b24440 4050 char *s = STRING(scan);
24d3c4a9 4051 ln = STR_LEN(scan);
984e6dd1 4052 if (utf8_target != is_utf8_pat) {
bc517b45 4053 /* The target and the pattern have differing utf8ness. */
1aa99e6b 4054 char *l = locinput;
24d3c4a9 4055 const char * const e = s + ln;
a72c7584 4056
f2ed9b32 4057 if (utf8_target) {
e6a3850e
KW
4058 /* The target is utf8, the pattern is not utf8.
4059 * Above-Latin1 code points can't match the pattern;
4060 * invariants match exactly, and the other Latin1 ones need
4061 * to be downgraded to a single byte in order to do the
4062 * comparison. (If we could be confident that the target
4063 * is not malformed, this could be refactored to have fewer
4064 * tests by just assuming that if the first bytes match, it
4065 * is an invariant, but there are tests in the test suite
4066 * dealing with (??{...}) which violate this) */
1aa99e6b 4067 while (s < e) {
5ac65bff 4068 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
e6a3850e
KW
4069 sayNO;
4070 }
4071 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4072 if (*l != *s) {
4073 sayNO;
4074 }
4075 l++;
4076 }
4077 else {
4078 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4079 sayNO;
4080 }
4081 l += 2;
4082 }
4083 s++;
1aa99e6b 4084 }
5ff6fc6d
JH
4085 }
4086 else {
4087 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 4088 while (s < e) {
e6a3850e
KW
4089 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4090 {
4091 sayNO;
4092 }
4093 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4094 if (*s != *l) {
4095 sayNO;
4096 }
4097 s++;
4098 }
4099 else {
4100 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4101 sayNO;
4102 }
4103 s += 2;
4104 }
4105 l++;
1aa99e6b 4106 }
5ff6fc6d 4107 }
1aa99e6b 4108 locinput = l;
1aa99e6b 4109 }
5ac65bff
KW
4110 else {
4111 /* The target and the pattern have the same utf8ness. */
4112 /* Inline the first character, for speed. */
4113 if (PL_regeol - locinput < ln
4114 || UCHARAT(s) != nextchr
4115 || (ln > 1 && memNE(s, locinput, ln)))
4116 {
4117 sayNO;
4118 }
4119 locinput += ln;
4120 }
d6a28714 4121 break;
95b24440 4122 }
7016d6eb 4123
3c0563b9 4124 case EXACTFL: { /* /abc/il */
a932d541 4125 re_fold_t folder;
9a5a5549
KW
4126 const U8 * fold_array;
4127 const char * s;
d513472c 4128 U32 fold_utf8_flags;
9a5a5549 4129
272d35c9 4130 RX_MATCH_TAINTED_on(reginfo->prog);
f67f9e53
KW
4131 folder = foldEQ_locale;
4132 fold_array = PL_fold_locale;
17580e7a 4133 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
4134 goto do_exactf;
4135
3c0563b9
DM
4136 case EXACTFU_SS: /* /\x{df}/iu */
4137 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4138 case EXACTFU: /* /abc/iu */
9a5a5549
KW
4139 folder = foldEQ_latin1;
4140 fold_array = PL_fold_latin1;
984e6dd1 4141 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
4142 goto do_exactf;
4143
3c0563b9 4144 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
4145 folder = foldEQ_latin1;
4146 fold_array = PL_fold_latin1;
57014d77 4147 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
4148 goto do_exactf;
4149
3c0563b9 4150 case EXACTF: /* /abc/i */
9a5a5549
KW
4151 folder = foldEQ;
4152 fold_array = PL_fold;
62bf7766 4153 fold_utf8_flags = 0;
9a5a5549
KW
4154
4155 do_exactf:
4156 s = STRING(scan);
24d3c4a9 4157 ln = STR_LEN(scan);
d6a28714 4158
984e6dd1 4159 if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
3c760661
KW
4160 /* Either target or the pattern are utf8, or has the issue where
4161 * the fold lengths may differ. */
be8e71aa 4162 const char * const l = locinput;
d07ddd77 4163 char *e = PL_regeol;
bc517b45 4164
984e6dd1 4165 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
fa5b1667 4166 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
4167 {
4168 sayNO;
5486206c 4169 }
d07ddd77 4170 locinput = e;
d07ddd77 4171 break;
a0ed51b3 4172 }
d6a28714 4173
0a138b74 4174 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
4175 if (UCHARAT(s) != nextchr
4176 && !NEXTCHR_IS_EOS
4177 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 4178 {
a0ed51b3 4179 sayNO;
9a5a5549 4180 }
24d3c4a9 4181 if (PL_regeol - locinput < ln)
b8c5462f 4182 sayNO;
9a5a5549 4183 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 4184 sayNO;
24d3c4a9 4185 locinput += ln;
a0d0e21e 4186 break;
9a5a5549 4187 }
63ac0dad
KW
4188
4189 /* XXX Could improve efficiency by separating these all out using a
4190 * macro or in-line function. At that point regcomp.c would no longer
4191 * have to set the FLAGS fields of these */
3c0563b9
DM
4192 case BOUNDL: /* /\b/l */
4193 case NBOUNDL: /* /\B/l */
272d35c9 4194 RX_MATCH_TAINTED_on(reginfo->prog);
b2680017 4195 /* FALL THROUGH */
3c0563b9
DM
4196 case BOUND: /* /\b/ */
4197 case BOUNDU: /* /\b/u */
4198 case BOUNDA: /* /\b/a */
4199 case NBOUND: /* /\B/ */
4200 case NBOUNDU: /* /\B/u */
4201 case NBOUNDA: /* /\B/a */
b2680017 4202 /* was last char in word? */
f2e96b5d
KW
4203 if (utf8_target
4204 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4205 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4206 {
b2680017
YO
4207 if (locinput == PL_bostr)
4208 ln = '\n';
4209 else {
4210 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4211
4212 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4213 }
63ac0dad 4214 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
0eb30aeb 4215 ln = isWORDCHAR_uni(ln);
7016d6eb
DM
4216 if (NEXTCHR_IS_EOS)
4217 n = 0;
4218 else {
4219 LOAD_UTF8_CHARCLASS_ALNUM();
03940dc2 4220 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
7016d6eb
DM
4221 utf8_target);
4222 }
b2680017
YO
4223 }
4224 else {
0eb30aeb
KW
4225 ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4226 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
b2680017
YO
4227 }
4228 }
4229 else {
cfaf538b
KW
4230
4231 /* Here the string isn't utf8, or is utf8 and only ascii
4232 * characters are to match \w. In the latter case looking at
4233 * the byte just prior to the current one may be just the final
4234 * byte of a multi-byte character. This is ok. There are two
4235 * cases:
4236 * 1) it is a single byte character, and then the test is doing
4237 * just what it's supposed to.
4238 * 2) it is a multi-byte character, in which case the final
4239 * byte is never mistakable for ASCII, and so the test
4240 * will say it is not a word character, which is the
4241 * correct answer. */
b2680017
YO
4242 ln = (locinput != PL_bostr) ?
4243 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
4244 switch (FLAGS(scan)) {
4245 case REGEX_UNICODE_CHARSET:
4246 ln = isWORDCHAR_L1(ln);
7016d6eb 4247 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
63ac0dad
KW
4248 break;
4249 case REGEX_LOCALE_CHARSET:
0eb30aeb
KW
4250 ln = isWORDCHAR_LC(ln);
4251 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
63ac0dad
KW
4252 break;
4253 case REGEX_DEPENDS_CHARSET:
0eb30aeb
KW
4254 ln = isWORDCHAR(ln);
4255 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
63ac0dad 4256 break;
cfaf538b 4257 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 4258 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b 4259 ln = isWORDCHAR_A(ln);
7016d6eb 4260 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
cfaf538b 4261 break;
63ac0dad
KW
4262 default:
4263 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4264 break;
b2680017
YO
4265 }
4266 }
63ac0dad
KW
4267 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4268 * regcomp.sym */
4269 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
4270 sayNO;
4271 break;
3c0563b9 4272
3c0563b9 4273 case ANYOF: /* /[abc]/ */
954a2af6 4274 case ANYOF_WARN_SUPER:
7016d6eb
DM
4275 if (NEXTCHR_IS_EOS)
4276 sayNO;
e0193e47 4277 if (utf8_target) {
635cd5d4 4278 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
09b08e9b 4279 sayNO;
635cd5d4 4280 locinput += UTF8SKIP(locinput);
ffc61ed2
JH
4281 }
4282 else {
20ed0b26 4283 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 4284 sayNO;
3640db6b 4285 locinput++;
e0f9d4a8 4286 }
b8c5462f 4287 break;
3c0563b9 4288
3018b823
KW
4289 /* The argument (FLAGS) to all the POSIX node types is the class number
4290 * */
ee9a90b8 4291
3018b823
KW
4292 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4293 to_complement = 1;
4294 /* FALLTHROUGH */
4295
4296 case POSIXL: /* \w or [:punct:] etc. under /l */
4297 if (NEXTCHR_IS_EOS)
bedac28b 4298 sayNO;
bedac28b 4299
3018b823
KW
4300 /* The locale hasn't influenced the outcome before this, so defer
4301 * tainting until now */
272d35c9 4302 RX_MATCH_TAINTED_on(reginfo->prog);
3018b823
KW
4303
4304 /* Use isFOO_lc() for characters within Latin1. (Note that
4305 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4306 * wouldn't be invariant) */
4307 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
eb4e9c04 4308 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
bedac28b
KW
4309 sayNO;
4310 }
4311 }
3018b823
KW
4312 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4313 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
eb4e9c04 4314 (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
3018b823
KW
4315 *(locinput + 1))))))
4316 {
bedac28b 4317 sayNO;
3018b823 4318 }
bedac28b 4319 }
3018b823
KW
4320 else { /* Here, must be an above Latin-1 code point */
4321 goto utf8_posix_not_eos;
bedac28b 4322 }
3018b823
KW
4323
4324 /* Here, must be utf8 */
4325 locinput += UTF8SKIP(locinput);
bedac28b
KW
4326 break;
4327
3018b823
KW
4328 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4329 to_complement = 1;
4330 /* FALLTHROUGH */
4331
4332 case POSIXD: /* \w or [:punct:] etc. under /d */
bedac28b 4333 if (utf8_target) {
3018b823 4334 goto utf8_posix;
bedac28b 4335 }
3018b823
KW
4336 goto posixa;
4337
4338 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
bedac28b 4339
3018b823 4340 if (NEXTCHR_IS_EOS) {
bedac28b
KW
4341 sayNO;
4342 }
bedac28b 4343
3018b823
KW
4344 /* All UTF-8 variants match */
4345 if (! UTF8_IS_INVARIANT(nextchr)) {
4346 goto increment_locinput;
bedac28b 4347 }
ee9a90b8 4348
3018b823
KW
4349 to_complement = 1;
4350 /* FALLTHROUGH */
4351
4352 case POSIXA: /* \w or [:punct:] etc. under /a */
4353
4354 posixa:
4355 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4356 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4357 * character is a single byte */
20d0b1e9 4358
3018b823
KW
4359 if (NEXTCHR_IS_EOS
4360 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4361 FLAGS(scan)))))
4362 {
0658cdde
KW
4363 sayNO;
4364 }
3018b823
KW
4365
4366 /* Here we are either not in utf8, or we matched a utf8-invariant,
4367 * so the next char is the next byte */
3640db6b 4368 locinput++;
0658cdde 4369 break;
3c0563b9 4370
3018b823
KW
4371 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4372 to_complement = 1;
4373 /* FALLTHROUGH */
4374
4375 case POSIXU: /* \w or [:punct:] etc. under /u */
4376 utf8_posix:
4377 if (NEXTCHR_IS_EOS) {
0658cdde
KW
4378 sayNO;
4379 }
3018b823
KW
4380 utf8_posix_not_eos:
4381
4382 /* Use _generic_isCC() for characters within Latin1. (Note that
4383 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4384 * wouldn't be invariant) */
4385 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4386 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4387 FLAGS(scan)))))
4388 {
4389 sayNO;
4390 }
4391 locinput++;
4392 }
4393 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4394 if (! (to_complement
4395 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4396 *(locinput + 1)),
4397 FLAGS(scan)))))
4398 {
4399 sayNO;
4400 }
4401 locinput += 2;
4402 }
4403 else { /* Handle above Latin-1 code points */
4404 classnum = (_char_class_number) FLAGS(scan);
4405 if (classnum < _FIRST_NON_SWASH_CC) {
4406
4407 /* Here, uses a swash to find such code points. Load if if
4408 * not done already */
4409 if (! PL_utf8_swash_ptrs[classnum]) {
4410 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4411 PL_utf8_swash_ptrs[classnum]
4412 = _core_swash_init("utf8",
4413 swash_property_names[classnum],
4414 &PL_sv_undef, 1, 0, NULL, &flags);
4415 }
4416 if (! (to_complement
4417 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4418 (U8 *) locinput, TRUE))))
4419 {
4420 sayNO;
4421 }
4422 }
4423 else { /* Here, uses macros to find above Latin-1 code points */
4424 switch (classnum) {
4425 case _CC_ENUM_SPACE: /* XXX would require separate
4426 code if we revert the change
4427 of \v matching this */
4428 case _CC_ENUM_PSXSPC:
4429 if (! (to_complement
4430 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4431 {
4432 sayNO;
4433 }
4434 break;
4435 case _CC_ENUM_BLANK:
4436 if (! (to_complement
4437 ^ cBOOL(is_HORIZWS_high(locinput))))
4438 {
4439 sayNO;
4440 }
4441 break;
4442 case _CC_ENUM_XDIGIT:
4443 if (! (to_complement
4444 ^ cBOOL(is_XDIGIT_high(locinput))))
4445 {
4446 sayNO;
4447 }
4448 break;
4449 case _CC_ENUM_VERTSPACE:
4450 if (! (to_complement
4451 ^ cBOOL(is_VERTWS_high(locinput))))
4452 {
4453 sayNO;
4454 }
4455 break;
4456 default: /* The rest, e.g. [:cntrl:], can't match
4457 above Latin1 */
4458 if (! to_complement) {
4459 sayNO;
4460 }
4461 break;
4462 }
4463 }
4464 locinput += UTF8SKIP(locinput);
4465 }
4466 break;
0658cdde 4467
37e2e78e
KW
4468 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4469 a Unicode extended Grapheme Cluster */
4470 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4471 extended Grapheme Cluster is:
4472
7aee35ff
KW
4473 CR LF
4474 | Prepend* Begin Extend*
4475 | .
37e2e78e 4476
7aee35ff
KW
4477 Begin is: ( Special_Begin | ! Control )
4478 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4479 Extend is: ( Grapheme_Extend | Spacing_Mark )
4480 Control is: [ GCB_Control | CR | LF ]
4481 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4482
27d4fc33
KW
4483 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4484 we can rewrite
4485
4486 Begin is ( Regular_Begin + Special Begin )
4487
4488 It turns out that 98.4% of all Unicode code points match
4489 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4490 the previous implementation for almost all Unicode code points.
27d4fc33 4491
37e2e78e
KW
4492 There is a subtlety with Prepend* which showed up in testing.
4493 Note that the Begin, and only the Begin is required in:
4494 | Prepend* Begin Extend*
cc3b396d
KW
4495 Also, Begin contains '! Control'. A Prepend must be a
4496 '! Control', which means it must also be a Begin. What it
4497 comes down to is that if we match Prepend* and then find no
4498 suitable Begin afterwards, that if we backtrack the last
4499 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4500 */
4501
7016d6eb 4502 if (NEXTCHR_IS_EOS)
a0ed51b3 4503 sayNO;
f2ed9b32 4504 if (! utf8_target) {
37e2e78e
KW
4505
4506 /* Match either CR LF or '.', as all the other possibilities
4507 * require utf8 */
4508 locinput++; /* Match the . or CR */
cc3b396d
KW
4509 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4510 match the LF */
37e2e78e 4511 && locinput < PL_regeol
e699a1d5
KW
4512 && UCHARAT(locinput) == '\n')
4513 {
4514 locinput++;
4515 }
37e2e78e
KW
4516 }
4517 else {
4518
4519 /* Utf8: See if is ( CR LF ); already know that locinput <
4520 * PL_regeol, so locinput+1 is in bounds */
7016d6eb 4521 if ( nextchr == '\r' && locinput+1 < PL_regeol
f67f9e53 4522 && UCHARAT(locinput + 1) == '\n')
7016d6eb 4523 {
37e2e78e
KW
4524 locinput += 2;
4525 }
4526 else {
45fdf108
KW
4527 STRLEN len;
4528
37e2e78e
KW
4529 /* In case have to backtrack to beginning, then match '.' */
4530 char *starting = locinput;
4531
4532 /* In case have to backtrack the last prepend */
e699a1d5 4533 char *previous_prepend = NULL;
37e2e78e
KW
4534
4535 LOAD_UTF8_CHARCLASS_GCB();
4536
45fdf108
KW
4537 /* Match (prepend)* */
4538 while (locinput < PL_regeol
4539 && (len = is_GCB_Prepend_utf8(locinput)))
4540 {
4541 previous_prepend = locinput;
4542 locinput += len;
a1853d78 4543 }
37e2e78e
KW
4544
4545 /* As noted above, if we matched a prepend character, but
4546 * the next thing won't match, back off the last prepend we
4547 * matched, as it is guaranteed to match the begin */
4548 if (previous_prepend
4549 && (locinput >= PL_regeol
c101f46d
KW
4550 || (! swash_fetch(PL_utf8_X_regular_begin,
4551 (U8*)locinput, utf8_target)
bff53399 4552 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
c101f46d 4553 )
37e2e78e
KW
4554 {
4555 locinput = previous_prepend;
4556 }
4557
4558 /* Note that here we know PL_regeol > locinput, as we
4559 * tested that upon input to this switch case, and if we
4560 * moved locinput forward, we tested the result just above
4561 * and it either passed, or we backed off so that it will
4562 * now pass */
11dfcd49
KW
4563 if (swash_fetch(PL_utf8_X_regular_begin,
4564 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4565 locinput += UTF8SKIP(locinput);
4566 }
bff53399 4567 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
37e2e78e
KW
4568
4569 /* Here did not match the required 'Begin' in the
4570 * second term. So just match the very first
4571 * character, the '.' of the final term of the regex */
4572 locinput = starting + UTF8SKIP(starting);
27d4fc33 4573 goto exit_utf8;
37e2e78e
KW
4574 } else {
4575
11dfcd49
KW
4576 /* Here is a special begin. It can be composed of
4577 * several individual characters. One possibility is
4578 * RI+ */
45fdf108
KW
4579 if ((len = is_GCB_RI_utf8(locinput))) {
4580 locinput += len;
11dfcd49 4581 while (locinput < PL_regeol
45fdf108 4582 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4583 {
45fdf108 4584 locinput += len;
11dfcd49 4585 }
45fdf108
KW
4586 } else if ((len = is_GCB_T_utf8(locinput))) {
4587 /* Another possibility is T+ */
4588 locinput += len;
11dfcd49 4589 while (locinput < PL_regeol
45fdf108 4590 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4591 {
45fdf108 4592 locinput += len;
11dfcd49
KW
4593 }
4594 } else {
4595
4596 /* Here, neither RI+ nor T+; must be some other
4597 * Hangul. That means it is one of the others: L,
4598 * LV, LVT or V, and matches:
4599 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4600
4601 /* Match L* */
4602 while (locinput < PL_regeol
45fdf108 4603 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4604 {
45fdf108 4605 locinput += len;
11dfcd49 4606 }
37e2e78e 4607
11dfcd49
KW
4608 /* Here, have exhausted L*. If the next character
4609 * is not an LV, LVT nor V, it means we had to have
4610 * at least one L, so matches L+ in the original
4611 * equation, we have a complete hangul syllable.
4612 * Are done. */
4613
4614 if (locinput < PL_regeol
45fdf108 4615 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49 4616 {
11dfcd49 4617 /* Otherwise keep going. Must be LV, LVT or V.
7d43c479
KW
4618 * See if LVT, by first ruling out V, then LV */
4619 if (! is_GCB_V_utf8(locinput)
4620 /* All but every TCount one is LV */
4621 && (valid_utf8_to_uvchr((U8 *) locinput,
4622 NULL)
4623 - SBASE)
4624 % TCount != 0)
4625 {
11dfcd49
KW
4626 locinput += UTF8SKIP(locinput);
4627 } else {
4628
4629 /* Must be V or LV. Take it, then match
4630 * V* */
4631 locinput += UTF8SKIP(locinput);
4632 while (locinput < PL_regeol
45fdf108 4633 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 4634 {
45fdf108 4635 locinput += len;
11dfcd49
KW
4636 }
4637 }
37e2e78e 4638
11dfcd49 4639 /* And any of LV, LVT, or V can be followed
45fdf108 4640 * by T* */
11dfcd49 4641 while (locinput < PL_regeol
45fdf108 4642 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4643 {
45fdf108 4644 locinput += len;
11dfcd49
KW
4645 }
4646 }
cd94d768 4647 }
11dfcd49 4648 }
37e2e78e 4649
11dfcd49
KW
4650 /* Match any extender */
4651 while (locinput < PL_regeol
4652 && swash_fetch(PL_utf8_X_extend,
4653 (U8*)locinput, utf8_target))
4654 {
4655 locinput += UTF8SKIP(locinput);
4656 }
37e2e78e 4657 }
27d4fc33 4658 exit_utf8:
37e2e78e
KW
4659 if (locinput > PL_regeol) sayNO;
4660 }
a0ed51b3 4661 break;
81714fb9 4662
3c0563b9 4663 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
4664 { /* The capture buffer cases. The ones beginning with N for the
4665 named buffers just convert to the equivalent numbered and
4666 pretend they were called as the corresponding numbered buffer
4667 op. */
26ecd678
TC
4668 /* don't initialize these in the declaration, it makes C++
4669 unhappy */
81714fb9 4670 char *s;
ff1157ca 4671 char type;
8368298a
TC
4672 re_fold_t folder;
4673 const U8 *fold_array;
26ecd678 4674 UV utf8_fold_flags;
8368298a 4675
272d35c9 4676 RX_MATCH_TAINTED_on(reginfo->prog);
d7ef4b73
KW
4677 folder = foldEQ_locale;
4678 fold_array = PL_fold_locale;
4679 type = REFFL;
17580e7a 4680 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4681 goto do_nref;
4682
3c0563b9 4683 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
4684 folder = foldEQ_latin1;
4685 fold_array = PL_fold_latin1;
4686 type = REFFA;
4687 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4688 goto do_nref;
4689
3c0563b9 4690 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
4691 folder = foldEQ_latin1;
4692 fold_array = PL_fold_latin1;
4693 type = REFFU;
d513472c 4694 utf8_fold_flags = 0;
d7ef4b73
KW
4695 goto do_nref;
4696
3c0563b9 4697 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
4698 folder = foldEQ;
4699 fold_array = PL_fold;
4700 type = REFF;
d513472c 4701 utf8_fold_flags = 0;
d7ef4b73
KW
4702 goto do_nref;
4703
3c0563b9 4704 case NREF: /* /\g{name}/ */
d7ef4b73 4705 type = REF;
83d7b90b
KW
4706 folder = NULL;
4707 fold_array = NULL;
d513472c 4708 utf8_fold_flags = 0;
d7ef4b73
KW
4709 do_nref:
4710
4711 /* For the named back references, find the corresponding buffer
4712 * number */
0a4db386
YO
4713 n = reg_check_named_buff_matched(rex,scan);
4714
d7ef4b73 4715 if ( ! n ) {
81714fb9 4716 sayNO;
d7ef4b73
KW
4717 }
4718 goto do_nref_ref_common;
4719
3c0563b9 4720 case REFFL: /* /\1/il */
272d35c9 4721 RX_MATCH_TAINTED_on(reginfo->prog);
d7ef4b73
KW
4722 folder = foldEQ_locale;
4723 fold_array = PL_fold_locale;
17580e7a 4724 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4725 goto do_ref;
4726
3c0563b9 4727 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
4728 folder = foldEQ_latin1;
4729 fold_array = PL_fold_latin1;
4730 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4731 goto do_ref;
4732
3c0563b9 4733 case REFFU: /* /\1/iu */
d7ef4b73
KW
4734 folder = foldEQ_latin1;
4735 fold_array = PL_fold_latin1;
d513472c 4736 utf8_fold_flags = 0;
d7ef4b73
KW
4737 goto do_ref;
4738
3c0563b9 4739 case REFF: /* /\1/i */
d7ef4b73
KW
4740 folder = foldEQ;
4741 fold_array = PL_fold;
d513472c 4742 utf8_fold_flags = 0;
83d7b90b 4743 goto do_ref;
d7ef4b73 4744
3c0563b9 4745 case REF: /* /\1/ */
83d7b90b
KW
4746 folder = NULL;
4747 fold_array = NULL;
d513472c 4748 utf8_fold_flags = 0;
83d7b90b 4749
d7ef4b73 4750 do_ref:
81714fb9 4751 type = OP(scan);
d7ef4b73
KW
4752 n = ARG(scan); /* which paren pair */
4753
4754 do_nref_ref_common:
b93070ed 4755 ln = rex->offs[n].start;
2c2d71f5 4756 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
b93070ed 4757 if (rex->lastparen < n || ln == -1)
af3f8c16 4758 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4759 if (ln == rex->offs[n].end)
a0d0e21e 4760 break;
a0ed51b3 4761
24d3c4a9 4762 s = PL_bostr + ln;
d7ef4b73 4763 if (type != REF /* REF can do byte comparison */
2f65c56d 4764 && (utf8_target || type == REFFU))
d7ef4b73
KW
4765 { /* XXX handle REFFL better */
4766 char * limit = PL_regeol;
4767
4768 /* This call case insensitively compares the entire buffer
4769 * at s, with the current input starting at locinput, but
4770 * not going off the end given by PL_regeol, and returns in
2db97d41 4771 * <limit> upon success, how much of the current input was
d7ef4b73 4772 * matched */
b93070ed 4773 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4774 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4775 {
4776 sayNO;
a0ed51b3 4777 }
d7ef4b73 4778 locinput = limit;
a0ed51b3
LW
4779 break;
4780 }
4781
d7ef4b73 4782 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
4783 if (!NEXTCHR_IS_EOS &&
4784 UCHARAT(s) != nextchr &&
81714fb9 4785 (type == REF ||
d7ef4b73 4786 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4787 sayNO;
b93070ed 4788 ln = rex->offs[n].end - ln;
24d3c4a9 4789 if (locinput + ln > PL_regeol)
4633a7c4 4790 sayNO;
81714fb9 4791 if (ln > 1 && (type == REF
24d3c4a9 4792 ? memNE(s, locinput, ln)
d7ef4b73 4793 : ! folder(s, locinput, ln)))
4633a7c4 4794 sayNO;
24d3c4a9 4795 locinput += ln;
a0d0e21e 4796 break;
81714fb9 4797 }
3c0563b9
DM
4798
4799 case NOTHING: /* null op; e.g. the 'nothing' following
4800 * the '*' in m{(a+|b)*}' */
4801 break;
4802 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 4803 break;
3c0563b9
DM
4804
4805 case BACK: /* ??? doesn't appear to be used ??? */
a0d0e21e 4806 break;
40a82448
DM
4807
4808#undef ST
4809#define ST st->u.eval
c277df42 4810 {
c277df42 4811 SV *ret;
d2f13c59 4812 REGEXP *re_sv;
6bda09f9 4813 regexp *re;
f8fc2ecf 4814 regexp_internal *rei;
1a147d38
YO
4815 regnode *startpoint;
4816
3c0563b9 4817 case GOSTART: /* (?R) */
e7707071
YO
4818 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4819 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4820 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4821 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4822 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4823 Perl_croak(aTHX_
4824 "Pattern subroutine nesting without pos change"
4825 " exceeded limit in regex");
6bda09f9
YO
4826 } else {
4827 nochange_depth = 0;
1a147d38 4828 }
288b8c02 4829 re_sv = rex_sv;
6bda09f9 4830 re = rex;
f8fc2ecf 4831 rei = rexi;
1a147d38 4832 if (OP(scan)==GOSUB) {
6bda09f9
YO
4833 startpoint = scan + ARG2L(scan);
4834 ST.close_paren = ARG(scan);
4835 } else {
f8fc2ecf 4836 startpoint = rei->program+1;
6bda09f9
YO
4837 ST.close_paren = 0;
4838 }
4839 goto eval_recurse_doit;
118e2215 4840 assert(0); /* NOTREACHED */
3c0563b9 4841
6bda09f9
YO
4842 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4843 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4844 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4845 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4846 } else {
4847 nochange_depth = 0;
4848 }
8e5e9ebe 4849 {
4aabdb9b 4850 /* execute the code in the {...} */
81ed78b2 4851
4aabdb9b 4852 dSP;
a6dc34f1 4853 IV before;
1f4d1a1e 4854 OP * const oop = PL_op;
4aabdb9b 4855 COP * const ocurcop = PL_curcop;
81ed78b2 4856 OP *nop;
d80618d2 4857 char *saved_regeol = PL_regeol;
91332126 4858 struct re_save_state saved_state;
81ed78b2 4859 CV *newcv;
91332126 4860
74088413 4861 /* save *all* paren positions */
92da3157 4862 regcppush(rex, 0, maxopenparen);
74088413
DM
4863 REGCP_SET(runops_cp);
4864
6562f1c4 4865 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4866 * eval we would normally put it on the save stack, like with
4867 * save_re_context. However, re-evals have a weird scoping so we
4868 * can't just add ENTER/LEAVE here. With that, things like
4869 *
4870 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4871 *
4872 * would break, as they expect the localisation to be unwound
4873 * only when the re-engine backtracks through the bit that
4874 * localised it.
4875 *
4876 * What we do instead is just saving the state in a local c
4877 * variable.
4878 */
91332126 4879 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
81ed78b2 4880
d24ca0c5 4881 PL_reg_state.re_reparsing = FALSE;
91332126 4882
81ed78b2
DM
4883 if (!caller_cv)
4884 caller_cv = find_runcv(NULL);
4885
4aabdb9b 4886 n = ARG(scan);
81ed78b2 4887
b30fcab9 4888 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8d919b0a 4889 newcv = (ReANY(
b30fcab9
DM
4890 (REGEXP*)(rexi->data->data[n])
4891 ))->qr_anoncv
81ed78b2
DM
4892 ;
4893 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4894 }
4895 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4896 newcv = caller_cv;
4897 nop = (OP*)rexi->data->data[n];
4898 assert(CvDEPTH(newcv));
68e2671b
DM
4899 }
4900 else {
d24ca0c5
DM
4901 /* literal with own CV */
4902 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4903 newcv = rex->qr_anoncv;
4904 nop = (OP*)rexi->data->data[n];
68e2671b 4905 }
81ed78b2 4906
0e458318
DM
4907 /* normally if we're about to execute code from the same
4908 * CV that we used previously, we just use the existing
4909 * CX stack entry. However, its possible that in the
4910 * meantime we may have backtracked, popped from the save
4911 * stack, and undone the SAVECOMPPAD(s) associated with
4912 * PUSH_MULTICALL; in which case PL_comppad no longer
4913 * points to newcv's pad. */
4914 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4915 {
4916 I32 depth = (newcv == caller_cv) ? 0 : 1;
4917 if (last_pushed_cv) {
4918 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4919 }
4920 else {
4921 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4922 }
4923 last_pushed_cv = newcv;
4924 }
c31ee3bb
DM
4925 else {
4926 /* these assignments are just to silence compiler
4927 * warnings */
4928 multicall_cop = NULL;
4929 newsp = NULL;
4930 }
0e458318
DM
4931 last_pad = PL_comppad;
4932
2e2e3f36
DM
4933 /* the initial nextstate you would normally execute
4934 * at the start of an eval (which would cause error
4935 * messages to come from the eval), may be optimised
4936 * away from the execution path in the regex code blocks;
4937 * so manually set PL_curcop to it initially */
4938 {
81ed78b2 4939 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
4940 assert(o->op_type == OP_NULL);
4941 if (o->op_targ == OP_SCOPE) {
4942 o = cUNOPo->op_first;
4943 }
4944 else {
4945 assert(o->op_targ == OP_LEAVE);
4946 o = cUNOPo->op_first;
4947 assert(o->op_type == OP_ENTER);
4948 o = o->op_sibling;
4949 }
4950
4951 if (o->op_type != OP_STUB) {
4952 assert( o->op_type == OP_NEXTSTATE
4953 || o->op_type == OP_DBSTATE
4954 || (o->op_type == OP_NULL
4955 && ( o->op_targ == OP_NEXTSTATE
4956 || o->op_targ == OP_DBSTATE
4957 )
4958 )
4959 );
4960 PL_curcop = (COP*)o;
4961 }
4962 }
81ed78b2 4963 nop = nop->op_next;
2e2e3f36 4964
24b23f37 4965 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
4966 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4967
b93070ed 4968 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4969
2bf803e2
YO
4970 if (sv_yes_mark) {
4971 SV *sv_mrk = get_sv("REGMARK", 1);
4972 sv_setsv(sv_mrk, sv_yes_mark);
4973 }
4974
81ed78b2
DM
4975 /* we don't use MULTICALL here as we want to call the
4976 * first op of the block of interest, rather than the
4977 * first op of the sub */
a6dc34f1 4978 before = (IV)(SP-PL_stack_base);
81ed78b2 4979 PL_op = nop;
8e5e9ebe
RGS
4980 CALLRUNOPS(aTHX); /* Scalar context. */
4981 SPAGAIN;
a6dc34f1 4982 if ((IV)(SP-PL_stack_base) == before)
075aa684 4983 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4984 else {
4985 ret = POPs;
4986 PUTBACK;
4987 }
4aabdb9b 4988
e4bfbed3
DM
4989 /* before restoring everything, evaluate the returned
4990 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
4991 * PL_op or pad. Also need to process any magic vars
4992 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
4993
4994 PL_op = NULL;
4995
5e98dac2 4996 re_sv = NULL;
e4bfbed3
DM
4997 if (logical == 0) /* (?{})/ */
4998 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4999 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
5000 sw = cBOOL(SvTRUE(ret));
5001 logical = 0;
5002 }
5003 else { /* /(??{}) */
497d0a96
DM
5004 /* if its overloaded, let the regex compiler handle
5005 * it; otherwise extract regex, or stringify */
5006 if (!SvAMAGIC(ret)) {
5007 SV *sv = ret;
5008 if (SvROK(sv))
5009 sv = SvRV(sv);
5010 if (SvTYPE(sv) == SVt_REGEXP)
5011 re_sv = (REGEXP*) sv;
5012 else if (SvSMAGICAL(sv)) {
5013 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5014 if (mg)
5015 re_sv = (REGEXP *) mg->mg_obj;
5016 }
e4bfbed3 5017
497d0a96
DM
5018 /* force any magic, undef warnings here */
5019 if (!re_sv) {
5020 ret = sv_mortalcopy(ret);
5021 (void) SvPV_force_nolen(ret);
5022 }
e4bfbed3
DM
5023 }
5024
5025 }
5026
91332126
FR
5027 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5028
81ed78b2
DM
5029 /* *** Note that at this point we don't restore
5030 * PL_comppad, (or pop the CxSUB) on the assumption it may
5031 * be used again soon. This is safe as long as nothing
5032 * in the regexp code uses the pad ! */
4aabdb9b 5033 PL_op = oop;
4aabdb9b 5034 PL_curcop = ocurcop;
d80618d2 5035 PL_regeol = saved_regeol;
92da3157 5036 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
e4bfbed3
DM
5037
5038 if (logical != 2)
4aabdb9b 5039 break;
8e5e9ebe 5040 }
e4bfbed3
DM
5041
5042 /* only /(??{})/ from now on */
24d3c4a9 5043 logical = 0;
4aabdb9b 5044 {
4f639d21
DM
5045 /* extract RE object from returned value; compiling if
5046 * necessary */
5c35adbb 5047
575c37f6
DM
5048 if (re_sv) {
5049 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 5050 }
0f5d15d6 5051 else {
c737faaf 5052 U32 pm_flags = 0;
0f5d15d6 5053
9753d940
DM
5054 if (SvUTF8(ret) && IN_BYTES) {
5055 /* In use 'bytes': make a copy of the octet
5056 * sequence, but without the flag on */
b9ad30b4
NC
5057 STRLEN len;
5058 const char *const p = SvPV(ret, len);
5059 ret = newSVpvn_flags(p, len, SVs_TEMP);
5060 }
732caac7
DM
5061 if (rex->intflags & PREGf_USE_RE_EVAL)
5062 pm_flags |= PMf_USE_RE_EVAL;
5063
5064 /* if we got here, it should be an engine which
5065 * supports compiling code blocks and stuff */
5066 assert(rex->engine && rex->engine->op_comp);
ec841a27 5067 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 5068 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
5069 rex->engine, NULL, NULL,
5070 /* copy /msix etc to inner pattern */
5071 scan->flags,
5072 pm_flags);
732caac7 5073
9041c2e3 5074 if (!(SvFLAGS(ret)
faf82a0b 5075 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 5076 | SVs_GMG))) {
a2794585
NC
5077 /* This isn't a first class regexp. Instead, it's
5078 caching a regexp onto an existing, Perl visible
5079 scalar. */
575c37f6 5080 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 5081 }
74088413
DM
5082 /* safe to do now that any $1 etc has been
5083 * interpolated into the new pattern string and
5084 * compiled */
92da3157 5085 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
0f5d15d6 5086 }
e1ff3a88 5087 SAVEFREESV(re_sv);
8d919b0a 5088 re = ReANY(re_sv);
4aabdb9b 5089 }
07bc277f 5090 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
5091 re->subbeg = rex->subbeg;
5092 re->sublen = rex->sublen;
6502e081
DM
5093 re->suboffset = rex->suboffset;
5094 re->subcoffset = rex->subcoffset;
f8fc2ecf 5095 rei = RXi_GET(re);
6bda09f9 5096 DEBUG_EXECUTE_r(
f2ed9b32 5097 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
5098 "Matching embedded");
5099 );
f8fc2ecf 5100 startpoint = rei->program + 1;
1a147d38 5101 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 5102
1a147d38 5103 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 5104 /* run the pattern returned from (??{...}) */
92da3157
DM
5105
5106 /* Save *all* the positions. */
5107 ST.cp = regcppush(rex, 0, maxopenparen);
40a82448 5108 REGCP_SET(ST.lastcp);
6bda09f9 5109
0357f1fd
ML
5110 re->lastparen = 0;
5111 re->lastcloseparen = 0;
5112
92da3157 5113 maxopenparen = 0;
4aabdb9b
DM
5114
5115 /* XXXX This is too dramatic a measure... */
5116 PL_reg_maxiter = 0;
5117
984e6dd1
DM
5118 ST.saved_utf8_pat = is_utf8_pat;
5119 is_utf8_pat = cBOOL(RX_UTF8(re_sv));
faec1544 5120
288b8c02 5121 ST.prev_rex = rex_sv;
faec1544 5122 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
5123 rex_sv = re_sv;
5124 SET_reg_curpm(rex_sv);
288b8c02 5125 rex = re;
f8fc2ecf 5126 rexi = rei;
faec1544 5127 cur_curlyx = NULL;
40a82448 5128 ST.B = next;
faec1544
DM
5129 ST.prev_eval = cur_eval;
5130 cur_eval = st;
faec1544 5131 /* now continue from first node in postoned RE */
4d5016e5 5132 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
118e2215 5133 assert(0); /* NOTREACHED */
c277df42 5134 }
40a82448 5135
faec1544
DM
5136 case EVAL_AB: /* cleanup after a successful (??{A})B */
5137 /* note: this is called twice; first after popping B, then A */
984e6dd1 5138 is_utf8_pat = ST.saved_utf8_pat;
ec43f78b
DM
5139 rex_sv = ST.prev_rex;
5140 SET_reg_curpm(rex_sv);
8d919b0a 5141 rex = ReANY(rex_sv);
f8fc2ecf 5142 rexi = RXi_GET(rex);
faec1544
DM
5143 regcpblow(ST.cp);
5144 cur_eval = ST.prev_eval;
5145 cur_curlyx = ST.prev_curlyx;
34a81e2b 5146
40a82448
DM
5147 /* XXXX This is too dramatic a measure... */
5148 PL_reg_maxiter = 0;
e7707071 5149 if ( nochange_depth )
4b196cd4 5150 nochange_depth--;
262b90c4 5151 sayYES;
40a82448 5152
40a82448 5153
faec1544
DM
5154 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5155 /* note: this is called twice; first after popping B, then A */
984e6dd1 5156 is_utf8_pat = ST.saved_utf8_pat;
ec43f78b
DM
5157 rex_sv = ST.prev_rex;
5158 SET_reg_curpm(rex_sv);
8d919b0a 5159 rex = ReANY(rex_sv);
f8fc2ecf 5160 rexi = RXi_GET(rex);
0357f1fd 5161
40a82448 5162 REGCP_UNWIND(ST.lastcp);
92da3157 5163 regcppop(rex, &maxopenparen);
faec1544
DM
5164 cur_eval = ST.prev_eval;
5165 cur_curlyx = ST.prev_curlyx;
5166 /* XXXX This is too dramatic a measure... */
5167 PL_reg_maxiter = 0;
e7707071 5168 if ( nochange_depth )
4b196cd4 5169 nochange_depth--;
40a82448 5170 sayNO_SILENT;
40a82448
DM
5171#undef ST
5172
3c0563b9 5173 case OPEN: /* ( */
c277df42 5174 n = ARG(scan); /* which paren pair */
1ca2007e 5175 rex->offs[n].start_tmp = locinput - PL_bostr;
92da3157
DM
5176 if (n > maxopenparen)
5177 maxopenparen = n;
495f47a5 5178 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
92da3157 5179 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
495f47a5
DM
5180 PTR2UV(rex),
5181 PTR2UV(rex->offs),
5182 (UV)n,
5183 (IV)rex->offs[n].start_tmp,
92da3157 5184 (UV)maxopenparen
495f47a5 5185 ));
e2e6a0f1 5186 lastopen = n;
a0d0e21e 5187 break;
495f47a5
DM
5188
5189/* XXX really need to log other places start/end are set too */
5190#define CLOSE_CAPTURE \
5191 rex->offs[n].start = rex->offs[n].start_tmp; \
5192 rex->offs[n].end = locinput - PL_bostr; \
5193 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5194 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5195 PTR2UV(rex), \
5196 PTR2UV(rex->offs), \
5197 (UV)n, \
5198 (IV)rex->offs[n].start, \
5199 (IV)rex->offs[n].end \
5200 ))
5201
3c0563b9 5202 case CLOSE: /* ) */
c277df42 5203 n = ARG(scan); /* which paren pair */
495f47a5 5204 CLOSE_CAPTURE;
b93070ed
DM
5205 if (n > rex->lastparen)
5206 rex->lastparen = n;
5207 rex->lastcloseparen = n;
3b6647e0 5208 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
5209 goto fake_end;
5210 }
a0d0e21e 5211 break;
3c0563b9
DM
5212
5213 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
5214 if (ARG(scan)){
5215 regnode *cursor;
5216 for (cursor=scan;
5217 cursor && OP(cursor)!=END;
5218 cursor=regnext(cursor))
5219 {
5220 if ( OP(cursor)==CLOSE ){
5221 n = ARG(cursor);
5222 if ( n <= lastopen ) {
495f47a5 5223 CLOSE_CAPTURE;
b93070ed
DM
5224 if (n > rex->lastparen)
5225 rex->lastparen = n;
5226 rex->lastcloseparen = n;
3b6647e0
RB
5227 if ( n == ARG(scan) || (cur_eval &&
5228 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
5229 break;
5230 }
5231 }
5232 }
5233 }
5234 goto fake_end;
5235 /*NOTREACHED*/
3c0563b9
DM
5236
5237 case GROUPP: /* (?(1)) */
c277df42 5238 n = ARG(scan); /* which paren pair */
b93070ed 5239 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 5240 break;
3c0563b9
DM
5241
5242 case NGROUPP: /* (?(<name>)) */
0a4db386 5243 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 5244 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 5245 break;
3c0563b9
DM
5246
5247 case INSUBP: /* (?(R)) */
0a4db386 5248 n = ARG(scan);
3b6647e0 5249 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 5250 break;
3c0563b9
DM
5251
5252 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
5253 sw = 0;
5254 break;
3c0563b9
DM
5255
5256 case IFTHEN: /* (?(cond)A|B) */
2c2d71f5 5257 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 5258 if (sw)
c277df42
IZ
5259 next = NEXTOPER(NEXTOPER(scan));
5260 else {
5261 next = scan + ARG(scan);
5262 if (OP(next) == IFTHEN) /* Fake one. */
5263 next = NEXTOPER(NEXTOPER(next));
5264 }
5265 break;
3c0563b9
DM
5266
5267 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 5268 logical = scan->flags;
c277df42 5269 break;
c476f425 5270
2ab05381 5271/*******************************************************************
2ab05381 5272
c476f425
DM
5273The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5274pattern, where A and B are subpatterns. (For simple A, CURLYM or
5275STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 5276
c476f425 5277A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 5278
c476f425
DM
5279On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5280state, which contains the current count, initialised to -1. It also sets
5281cur_curlyx to point to this state, with any previous value saved in the
5282state block.
2ab05381 5283
c476f425
DM
5284CURLYX then jumps straight to the WHILEM op, rather than executing A,
5285since the pattern may possibly match zero times (i.e. it's a while {} loop
5286rather than a do {} while loop).
2ab05381 5287
c476f425
DM
5288Each entry to WHILEM represents a successful match of A. The count in the
5289CURLYX block is incremented, another WHILEM state is pushed, and execution
5290passes to A or B depending on greediness and the current count.
2ab05381 5291
c476f425
DM
5292For example, if matching against the string a1a2a3b (where the aN are
5293substrings that match /A/), then the match progresses as follows: (the
5294pushed states are interspersed with the bits of strings matched so far):
2ab05381 5295
c476f425
DM
5296 <CURLYX cnt=-1>
5297 <CURLYX cnt=0><WHILEM>
5298 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5299 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5300 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5301 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 5302
c476f425
DM
5303(Contrast this with something like CURLYM, which maintains only a single
5304backtrack state:
2ab05381 5305
c476f425
DM
5306 <CURLYM cnt=0> a1
5307 a1 <CURLYM cnt=1> a2
5308 a1 a2 <CURLYM cnt=2> a3
5309 a1 a2 a3 <CURLYM cnt=3> b
5310)
2ab05381 5311
c476f425
DM
5312Each WHILEM state block marks a point to backtrack to upon partial failure
5313of A or B, and also contains some minor state data related to that
5314iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5315overall state, such as the count, and pointers to the A and B ops.
2ab05381 5316
c476f425
DM
5317This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5318must always point to the *current* CURLYX block, the rules are:
2ab05381 5319
c476f425
DM
5320When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5321and set cur_curlyx to point the new block.
2ab05381 5322
c476f425
DM
5323When popping the CURLYX block after a successful or unsuccessful match,
5324restore the previous cur_curlyx.
2ab05381 5325
c476f425
DM
5326When WHILEM is about to execute B, save the current cur_curlyx, and set it
5327to the outer one saved in the CURLYX block.
2ab05381 5328
c476f425
DM
5329When popping the WHILEM block after a successful or unsuccessful B match,
5330restore the previous cur_curlyx.
2ab05381 5331
c476f425
DM
5332Here's an example for the pattern (AI* BI)*BO
5333I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 5334
c476f425
DM
5335cur_
5336curlyx backtrack stack
5337------ ---------------
5338NULL
5339CO <CO prev=NULL> <WO>
5340CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5341CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5342NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 5343
c476f425
DM
5344At this point the pattern succeeds, and we work back down the stack to
5345clean up, restoring as we go:
95b24440 5346
c476f425
DM
5347CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5348CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5349CO <CO prev=NULL> <WO>
5350NULL
a0374537 5351
c476f425
DM
5352*******************************************************************/
5353
5354#define ST st->u.curlyx
5355
5356 case CURLYX: /* start of /A*B/ (for complex A) */
5357 {
5358 /* No need to save/restore up to this paren */
5359 I32 parenfloor = scan->flags;
5360
5361 assert(next); /* keep Coverity happy */
5362 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5363 next += ARG(next);
5364
5365 /* XXXX Probably it is better to teach regpush to support
92da3157 5366 parenfloor > maxopenparen ... */
b93070ed
DM
5367 if (parenfloor > (I32)rex->lastparen)
5368 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
5369
5370 ST.prev_curlyx= cur_curlyx;
5371 cur_curlyx = st;
5372 ST.cp = PL_savestack_ix;
5373
5374 /* these fields contain the state of the current curly.
5375 * they are accessed by subsequent WHILEMs */
5376 ST.parenfloor = parenfloor;
d02d6d97 5377 ST.me = scan;
c476f425 5378 ST.B = next;
24d3c4a9
DM
5379 ST.minmod = minmod;
5380 minmod = 0;
c476f425
DM
5381 ST.count = -1; /* this will be updated by WHILEM */
5382 ST.lastloc = NULL; /* this will be updated by WHILEM */
5383
4d5016e5 5384 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
118e2215 5385 assert(0); /* NOTREACHED */
c476f425 5386 }
a0d0e21e 5387
c476f425 5388 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
5389 cur_curlyx = ST.prev_curlyx;
5390 sayYES;
118e2215 5391 assert(0); /* NOTREACHED */
a0d0e21e 5392
c476f425
DM
5393 case CURLYX_end_fail: /* just failed to match all of A*B */
5394 regcpblow(ST.cp);
5395 cur_curlyx = ST.prev_curlyx;
5396 sayNO;
118e2215 5397 assert(0); /* NOTREACHED */
4633a7c4 5398
a0d0e21e 5399
c476f425
DM
5400#undef ST
5401#define ST st->u.whilem
5402
5403 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5404 {
5405 /* see the discussion above about CURLYX/WHILEM */
c476f425 5406 I32 n;
d02d6d97
DM
5407 int min = ARG1(cur_curlyx->u.curlyx.me);
5408 int max = ARG2(cur_curlyx->u.curlyx.me);
5409 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5410
c476f425
DM
5411 assert(cur_curlyx); /* keep Coverity happy */
5412 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5413 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5414 ST.cache_offset = 0;
5415 ST.cache_mask = 0;
5416
c476f425
DM
5417
5418 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
5419 "%*s whilem: matched %ld out of %d..%d\n",
5420 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 5421 );
a0d0e21e 5422
c476f425 5423 /* First just match a string of min A's. */
a0d0e21e 5424
d02d6d97 5425 if (n < min) {
92da3157
DM
5426 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5427 maxopenparen);
c476f425 5428 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
5429 REGCP_SET(ST.lastcp);
5430
4d5016e5 5431 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
118e2215 5432 assert(0); /* NOTREACHED */
c476f425
DM
5433 }
5434
5435 /* If degenerate A matches "", assume A done. */
5436
5437 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5438 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5439 "%*s whilem: empty match detected, trying continuation...\n",
5440 REPORT_CODE_OFF+depth*2, "")
5441 );
5442 goto do_whilem_B_max;
5443 }
5444
5445 /* super-linear cache processing */
5446
5447 if (scan->flags) {
a0d0e21e 5448
2c2d71f5 5449 if (!PL_reg_maxiter) {
c476f425
DM
5450 /* start the countdown: Postpone detection until we
5451 * know the match is not *that* much linear. */
2c2d71f5 5452 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
5453 /* possible overflow for long strings and many CURLYX's */
5454 if (PL_reg_maxiter < 0)
5455 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
5456 PL_reg_leftiter = PL_reg_maxiter;
5457 }
c476f425 5458
2c2d71f5 5459 if (PL_reg_leftiter-- == 0) {
c476f425 5460 /* initialise cache */
3298f257 5461 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 5462 if (PL_reg_poscache) {
eb160463 5463 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
5464 Renew(PL_reg_poscache, size, char);
5465 PL_reg_poscache_size = size;
5466 }
5467 Zero(PL_reg_poscache, size, char);
5468 }
5469 else {
5470 PL_reg_poscache_size = size;
a02a5408 5471 Newxz(PL_reg_poscache, size, char);
2c2d71f5 5472 }
c476f425
DM
5473 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5474 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5475 PL_colors[4], PL_colors[5])
5476 );
2c2d71f5 5477 }
c476f425 5478
2c2d71f5 5479 if (PL_reg_leftiter < 0) {
c476f425
DM
5480 /* have we already failed at this position? */
5481 I32 offset, mask;
5482 offset = (scan->flags & 0xf) - 1
5483 + (locinput - PL_bostr) * (scan->flags>>4);
5484 mask = 1 << (offset % 8);
5485 offset /= 8;
5486 if (PL_reg_poscache[offset] & mask) {
5487 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5488 "%*s whilem: (cache) already tried at this position...\n",
5489 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5490 );
3298f257 5491 sayNO; /* cache records failure */
2c2d71f5 5492 }
c476f425
DM
5493 ST.cache_offset = offset;
5494 ST.cache_mask = mask;
2c2d71f5 5495 }
c476f425 5496 }
2c2d71f5 5497
c476f425 5498 /* Prefer B over A for minimal matching. */
a687059c 5499
c476f425
DM
5500 if (cur_curlyx->u.curlyx.minmod) {
5501 ST.save_curlyx = cur_curlyx;
5502 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
92da3157
DM
5503 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5504 maxopenparen);
c476f425 5505 REGCP_SET(ST.lastcp);
4d5016e5
DM
5506 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5507 locinput);
118e2215 5508 assert(0); /* NOTREACHED */
c476f425 5509 }
a0d0e21e 5510
c476f425
DM
5511 /* Prefer A over B for maximal matching. */
5512
d02d6d97 5513 if (n < max) { /* More greed allowed? */
92da3157
DM
5514 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5515 maxopenparen);
c476f425
DM
5516 cur_curlyx->u.curlyx.lastloc = locinput;
5517 REGCP_SET(ST.lastcp);
4d5016e5 5518 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
118e2215 5519 assert(0); /* NOTREACHED */
c476f425
DM
5520 }
5521 goto do_whilem_B_max;
5522 }
118e2215 5523 assert(0); /* NOTREACHED */
c476f425
DM
5524
5525 case WHILEM_B_min: /* just matched B in a minimal match */
5526 case WHILEM_B_max: /* just matched B in a maximal match */
5527 cur_curlyx = ST.save_curlyx;
5528 sayYES;
118e2215 5529 assert(0); /* NOTREACHED */
c476f425
DM
5530
5531 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5532 cur_curlyx = ST.save_curlyx;
5533 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5534 cur_curlyx->u.curlyx.count--;
5535 CACHEsayNO;
118e2215 5536 assert(0); /* NOTREACHED */
c476f425
DM
5537
5538 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
5539 /* FALL THROUGH */
5540 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa 5541 REGCP_UNWIND(ST.lastcp);
92da3157 5542 regcppop(rex, &maxopenparen);
c476f425
DM
5543 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5544 cur_curlyx->u.curlyx.count--;
5545 CACHEsayNO;
118e2215 5546 assert(0); /* NOTREACHED */
c476f425
DM
5547
5548 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5549 REGCP_UNWIND(ST.lastcp);
92da3157 5550 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
c476f425
DM
5551 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5552 "%*s whilem: failed, trying continuation...\n",
5553 REPORT_CODE_OFF+depth*2, "")
5554 );
5555 do_whilem_B_max:
5556 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5557 && ckWARN(WARN_REGEXP)
39819bd9 5558 && !reginfo->warned)
c476f425 5559 {
39819bd9 5560 reginfo->warned = TRUE;
dcbac5bb
FC
5561 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5562 "Complex regular subexpression recursion limit (%d) "
5563 "exceeded",
c476f425
DM
5564 REG_INFTY - 1);
5565 }
5566
5567 /* now try B */
5568 ST.save_curlyx = cur_curlyx;
5569 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
5570 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5571 locinput);
118e2215 5572 assert(0); /* NOTREACHED */
c476f425
DM
5573
5574 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5575 cur_curlyx = ST.save_curlyx;
5576 REGCP_UNWIND(ST.lastcp);
92da3157 5577 regcppop(rex, &maxopenparen);
c476f425 5578
d02d6d97 5579 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5580 /* Maximum greed exceeded */
5581 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5582 && ckWARN(WARN_REGEXP)
39819bd9 5583 && !reginfo->warned)
c476f425 5584 {
39819bd9 5585 reginfo->warned = TRUE;
c476f425 5586 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5587 "Complex regular subexpression recursion "
5588 "limit (%d) exceeded",
c476f425 5589 REG_INFTY - 1);
a0d0e21e 5590 }
c476f425 5591 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5592 CACHEsayNO;
a0d0e21e 5593 }
c476f425
DM
5594
5595 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5596 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5597 );
5598 /* Try grabbing another A and see if it helps. */
c476f425 5599 cur_curlyx->u.curlyx.lastloc = locinput;
92da3157
DM
5600 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5601 maxopenparen);
c476f425 5602 REGCP_SET(ST.lastcp);
d02d6d97 5603 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
5604 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5605 locinput);
118e2215 5606 assert(0); /* NOTREACHED */
40a82448
DM
5607
5608#undef ST
5609#define ST st->u.branch
5610
5611 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5612 next = scan + ARG(scan);
5613 if (next == scan)
5614 next = NULL;
40a82448
DM
5615 scan = NEXTOPER(scan);
5616 /* FALL THROUGH */
c277df42 5617
40a82448
DM
5618 case BRANCH: /* /(...|A|...)/ */
5619 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5620 ST.lastparen = rex->lastparen;
f6033a9d 5621 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5622 ST.next_branch = next;
5623 REGCP_SET(ST.cp);
02db2b7b 5624
40a82448 5625 /* Now go into the branch */
5d458dd8 5626 if (has_cutgroup) {
4d5016e5 5627 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5628 } else {
4d5016e5 5629 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5630 }
118e2215 5631 assert(0); /* NOTREACHED */
3c0563b9
DM
5632
5633 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 5634 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5635 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 5636 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
118e2215 5637 assert(0); /* NOTREACHED */
3c0563b9 5638
5d458dd8
YO
5639 case CUTGROUP_next_fail:
5640 do_cutgroup = 1;
5641 no_final = 1;
5642 if (st->u.mark.mark_name)
5643 sv_commit = st->u.mark.mark_name;
5644 sayNO;
118e2215 5645 assert(0); /* NOTREACHED */
3c0563b9 5646
5d458dd8
YO
5647 case BRANCH_next:
5648 sayYES;
118e2215 5649 assert(0); /* NOTREACHED */
3c0563b9 5650
40a82448 5651 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5652 if (do_cutgroup) {
5653 do_cutgroup = 0;
5654 no_final = 0;
5655 }
40a82448 5656 REGCP_UNWIND(ST.cp);
a8d1f4b4 5657 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5658 scan = ST.next_branch;
5659 /* no more branches? */
5d458dd8
YO
5660 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5661 DEBUG_EXECUTE_r({
5662 PerlIO_printf( Perl_debug_log,
5663 "%*s %sBRANCH failed...%s\n",
5664 REPORT_CODE_OFF+depth*2, "",
5665 PL_colors[4],
5666 PL_colors[5] );
5667 });
5668 sayNO_SILENT;
5669 }
40a82448 5670 continue; /* execute next BRANCH[J] op */
118e2215 5671 assert(0); /* NOTREACHED */
40a82448 5672
3c0563b9 5673 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 5674 minmod = 1;
a0d0e21e 5675 break;
40a82448
DM
5676
5677#undef ST
5678#define ST st->u.curlym
5679
5680 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5681
5682 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5683 * only a single backtracking state, no matter how many matches
40a82448
DM
5684 * there are in {m,n}. It relies on the pattern being constant
5685 * length, with no parens to influence future backrefs
5686 */
5687
5688 ST.me = scan;
dc45a647 5689 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5690
f6033a9d
DM
5691 ST.lastparen = rex->lastparen;
5692 ST.lastcloseparen = rex->lastcloseparen;
5693
40a82448
DM
5694 /* if paren positive, emulate an OPEN/CLOSE around A */
5695 if (ST.me->flags) {
3b6647e0 5696 U32 paren = ST.me->flags;
92da3157
DM
5697 if (paren > maxopenparen)
5698 maxopenparen = paren;
c277df42 5699 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5700 }
40a82448
DM
5701 ST.A = scan;
5702 ST.B = next;
5703 ST.alen = 0;
5704 ST.count = 0;
24d3c4a9
DM
5705 ST.minmod = minmod;
5706 minmod = 0;
40a82448
DM
5707 ST.c1 = CHRTEST_UNINIT;
5708 REGCP_SET(ST.cp);
6407bf3b 5709
40a82448
DM
5710 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5711 goto curlym_do_B;
5712
5713 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 5714 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
118e2215 5715 assert(0); /* NOTREACHED */
5f80c4cf 5716
40a82448 5717 case CURLYM_A: /* we've just matched an A */
40a82448
DM
5718 ST.count++;
5719 /* after first match, determine A's length: u.curlym.alen */
5720 if (ST.count == 1) {
5721 if (PL_reg_match_utf8) {
c07e9d7b
DM
5722 char *s = st->locinput;
5723 while (s < locinput) {
40a82448
DM
5724 ST.alen++;
5725 s += UTF8SKIP(s);
5726 }
5727 }
5728 else {
c07e9d7b 5729 ST.alen = locinput - st->locinput;
40a82448
DM
5730 }
5731 if (ST.alen == 0)
5732 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5733 }
0cadcf80
DM
5734 DEBUG_EXECUTE_r(
5735 PerlIO_printf(Perl_debug_log,
40a82448 5736 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5737 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5738 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5739 );
5740
0a4db386 5741 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5742 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5743 goto fake_end;
5744
c966426a
DM
5745 {
5746 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5747 if ( max == REG_INFTY || ST.count < max )
5748 goto curlym_do_A; /* try to match another A */
5749 }
40a82448 5750 goto curlym_do_B; /* try to match B */
5f80c4cf 5751
40a82448
DM
5752 case CURLYM_A_fail: /* just failed to match an A */
5753 REGCP_UNWIND(ST.cp);
0a4db386
YO
5754
5755 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5756 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5757 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5758 sayNO;
0cadcf80 5759
40a82448 5760 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
5761 if (ST.c1 == CHRTEST_UNINIT) {
5762 /* calculate c1 and c2 for possible match of 1st char
5763 * following curly */
5764 ST.c1 = ST.c2 = CHRTEST_VOID;
5765 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5766 regnode *text_node = ST.B;
5767 if (! HAS_TEXT(text_node))
5768 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5769 /* this used to be
5770
5771 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5772
5773 But the former is redundant in light of the latter.
5774
5775 if this changes back then the macro for
5776 IS_TEXT and friends need to change.
5777 */
c74f6de9 5778 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8 5779 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1
DM
5780 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5781 is_utf8_pat))
c74f6de9
KW
5782 {
5783 sayNO;
5784 }
c277df42 5785 }
c277df42 5786 }
40a82448
DM
5787 }
5788
5789 DEBUG_EXECUTE_r(
5790 PerlIO_printf(Perl_debug_log,
5791 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5792 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5793 "", (IV)ST.count)
5794 );
c74f6de9 5795 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
5796 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5797 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5798 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5799 {
5800 /* simulate B failing */
5801 DEBUG_OPTIMISE_r(
5802 PerlIO_printf(Perl_debug_log,
5803 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5804 (int)(REPORT_CODE_OFF+(depth*2)),"",
5805 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5806 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5807 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5808 );
5809 state_num = CURLYM_B_fail;
5810 goto reenter_switch;
5811 }
5812 }
5813 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
5814 /* simulate B failing */
5815 DEBUG_OPTIMISE_r(
5816 PerlIO_printf(Perl_debug_log,
79a2a0e8 5817 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5400f398 5818 (int)(REPORT_CODE_OFF+(depth*2)),"",
79a2a0e8
KW
5819 (int) nextchr, ST.c1, ST.c2)
5820 );
5400f398
KW
5821 state_num = CURLYM_B_fail;
5822 goto reenter_switch;
5823 }
c74f6de9 5824 }
40a82448
DM
5825
5826 if (ST.me->flags) {
f6033a9d 5827 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5828 I32 paren = ST.me->flags;
5829 if (ST.count) {
b93070ed 5830 rex->offs[paren].start
c07e9d7b
DM
5831 = HOPc(locinput, -ST.alen) - PL_bostr;
5832 rex->offs[paren].end = locinput - PL_bostr;
f6033a9d
DM
5833 if ((U32)paren > rex->lastparen)
5834 rex->lastparen = paren;
5835 rex->lastcloseparen = paren;
c277df42 5836 }
40a82448 5837 else
b93070ed 5838 rex->offs[paren].end = -1;
0a4db386 5839 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5840 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5841 {
5842 if (ST.count)
5843 goto fake_end;
5844 else
5845 sayNO;
5846 }
c277df42 5847 }
0a4db386 5848
4d5016e5 5849 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
118e2215 5850 assert(0); /* NOTREACHED */
40a82448
DM
5851
5852 case CURLYM_B_fail: /* just failed to match a B */
5853 REGCP_UNWIND(ST.cp);
a8d1f4b4 5854 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5855 if (ST.minmod) {
84d2fa14
HS
5856 I32 max = ARG2(ST.me);
5857 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5858 sayNO;
5859 goto curlym_do_A; /* try to match a further A */
5860 }
5861 /* backtrack one A */
5862 if (ST.count == ARG1(ST.me) /* min */)
5863 sayNO;
5864 ST.count--;
7016d6eb 5865 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
5866 goto curlym_do_B; /* try to match B */
5867
c255a977
DM
5868#undef ST
5869#define ST st->u.curly
40a82448 5870
c255a977
DM
5871#define CURLY_SETPAREN(paren, success) \
5872 if (paren) { \
5873 if (success) { \
b93070ed
DM
5874 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5875 rex->offs[paren].end = locinput - PL_bostr; \
f6033a9d
DM
5876 if (paren > rex->lastparen) \
5877 rex->lastparen = paren; \
b93070ed 5878 rex->lastcloseparen = paren; \
c255a977 5879 } \
f6033a9d 5880 else { \
b93070ed 5881 rex->offs[paren].end = -1; \
f6033a9d
DM
5882 rex->lastparen = ST.lastparen; \
5883 rex->lastcloseparen = ST.lastcloseparen; \
5884 } \
c255a977
DM
5885 }
5886
b40a2c17 5887 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
5888 ST.paren = 0;
5889 ST.min = 0;
5890 ST.max = REG_INFTY;
a0d0e21e
LW
5891 scan = NEXTOPER(scan);
5892 goto repeat;
3c0563b9 5893
b40a2c17 5894 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
5895 ST.paren = 0;
5896 ST.min = 1;
5897 ST.max = REG_INFTY;
c277df42 5898 scan = NEXTOPER(scan);
c255a977 5899 goto repeat;
3c0563b9 5900
b40a2c17 5901 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
5902 ST.paren = scan->flags; /* Which paren to set */
5903 ST.lastparen = rex->lastparen;
f6033a9d 5904 ST.lastcloseparen = rex->lastcloseparen;
92da3157
DM
5905 if (ST.paren > maxopenparen)
5906 maxopenparen = ST.paren;
c255a977
DM
5907 ST.min = ARG1(scan); /* min to match */
5908 ST.max = ARG2(scan); /* max to match */
0a4db386 5909 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5910 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5911 ST.min=1;
5912 ST.max=1;
5913 }
c255a977
DM
5914 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5915 goto repeat;
3c0563b9 5916
b40a2c17 5917 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
5918 ST.paren = 0;
5919 ST.min = ARG1(scan); /* min to match */
5920 ST.max = ARG2(scan); /* max to match */
5921 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5922 repeat:
a0d0e21e
LW
5923 /*
5924 * Lookahead to avoid useless match attempts
5925 * when we know what character comes next.
c255a977 5926 *
5f80c4cf
JP
5927 * Used to only do .*x and .*?x, but now it allows
5928 * for )'s, ('s and (?{ ... })'s to be in the way
5929 * of the quantifier and the EXACT-like node. -- japhy
5930 */
5931
eb5c1be8 5932 assert(ST.min <= ST.max);
3337dfe3
KW
5933 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5934 ST.c1 = ST.c2 = CHRTEST_VOID;
5935 }
5936 else {
5f80c4cf
JP
5937 regnode *text_node = next;
5938
3dab1dad
YO
5939 if (! HAS_TEXT(text_node))
5940 FIND_NEXT_IMPT(text_node);
5f80c4cf 5941
9e137952 5942 if (! HAS_TEXT(text_node))
c255a977 5943 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5944 else {
ee9b8eae 5945 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5946 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5947 }
c74f6de9 5948 else {
ee9b8eae
YO
5949
5950 /* Currently we only get here when
5951
5952 PL_rekind[OP(text_node)] == EXACT
5953
5954 if this changes back then the macro for IS_TEXT and
5955 friends need to change. */
79a2a0e8 5956 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1
DM
5957 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5958 is_utf8_pat))
c74f6de9
KW
5959 {
5960 sayNO;
5961 }
5962 }
1aa99e6b 5963 }
bbce6d69 5964 }
c255a977
DM
5965
5966 ST.A = scan;
5967 ST.B = next;
24d3c4a9 5968 if (minmod) {
eb72505d 5969 char *li = locinput;
24d3c4a9 5970 minmod = 0;
984e6dd1
DM
5971 if (ST.min &&
5972 regrepeat(rex, &li, ST.A, ST.min, depth, is_utf8_pat)
5973 < ST.min)
4633a7c4 5974 sayNO;
7016d6eb 5975 SET_locinput(li);
c255a977 5976 ST.count = ST.min;
c255a977
DM
5977 REGCP_SET(ST.cp);
5978 if (ST.c1 == CHRTEST_VOID)
5979 goto curly_try_B_min;
5980
5981 ST.oldloc = locinput;
5982
5983 /* set ST.maxpos to the furthest point along the
5984 * string that could possibly match */
5985 if (ST.max == REG_INFTY) {
5986 ST.maxpos = PL_regeol - 1;
f2ed9b32 5987 if (utf8_target)
c255a977
DM
5988 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5989 ST.maxpos--;
5990 }
f2ed9b32 5991 else if (utf8_target) {
c255a977
DM
5992 int m = ST.max - ST.min;
5993 for (ST.maxpos = locinput;
9a902117 5994 m >0 && ST.maxpos < PL_regeol; m--)
c255a977
DM
5995 ST.maxpos += UTF8SKIP(ST.maxpos);
5996 }
5997 else {
5998 ST.maxpos = locinput + ST.max - ST.min;
5999 if (ST.maxpos >= PL_regeol)
6000 ST.maxpos = PL_regeol - 1;
6001 }
6002 goto curly_try_B_min_known;
6003
6004 }
6005 else {
eb72505d
DM
6006 /* avoid taking address of locinput, so it can remain
6007 * a register var */
6008 char *li = locinput;
984e6dd1
DM
6009 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth,
6010 is_utf8_pat);
c255a977
DM
6011 if (ST.count < ST.min)
6012 sayNO;
7016d6eb 6013 SET_locinput(li);
c255a977
DM
6014 if ((ST.count > ST.min)
6015 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6016 {
6017 /* A{m,n} must come at the end of the string, there's
6018 * no point in backing off ... */
6019 ST.min = ST.count;
6020 /* ...except that $ and \Z can match before *and* after
6021 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6022 We may back off by one in this case. */
eb72505d 6023 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
6024 ST.min--;
6025 }
6026 REGCP_SET(ST.cp);
6027 goto curly_try_B_max;
6028 }
118e2215 6029 assert(0); /* NOTREACHED */
c255a977
DM
6030
6031
6032 case CURLY_B_min_known_fail:
6033 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 6034
c255a977 6035 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6036 if (ST.paren) {
6037 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6038 }
c255a977
DM
6039 /* Couldn't or didn't -- move forward. */
6040 ST.oldloc = locinput;
f2ed9b32 6041 if (utf8_target)
c255a977
DM
6042 locinput += UTF8SKIP(locinput);
6043 else
6044 locinput++;
6045 ST.count++;
6046 curly_try_B_min_known:
6047 /* find the next place where 'B' could work, then call B */
6048 {
6049 int n;
f2ed9b32 6050 if (utf8_target) {
c255a977
DM
6051 n = (ST.oldloc == locinput) ? 0 : 1;
6052 if (ST.c1 == ST.c2) {
c255a977 6053 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6054 while (locinput <= ST.maxpos
6055 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6056 {
6057 locinput += UTF8SKIP(locinput);
c255a977
DM
6058 n++;
6059 }
1aa99e6b
IH
6060 }
6061 else {
c255a977 6062 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6063 while (locinput <= ST.maxpos
6064 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6065 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6066 {
6067 locinput += UTF8SKIP(locinput);
c255a977 6068 n++;
1aa99e6b 6069 }
0fe9bf95
IZ
6070 }
6071 }
5400f398 6072 else { /* Not utf8_target */
c255a977
DM
6073 if (ST.c1 == ST.c2) {
6074 while (locinput <= ST.maxpos &&
6075 UCHARAT(locinput) != ST.c1)
6076 locinput++;
bbce6d69 6077 }
c255a977
DM
6078 else {
6079 while (locinput <= ST.maxpos
6080 && UCHARAT(locinput) != ST.c1
6081 && UCHARAT(locinput) != ST.c2)
6082 locinput++;
a0ed51b3 6083 }
c255a977
DM
6084 n = locinput - ST.oldloc;
6085 }
6086 if (locinput > ST.maxpos)
6087 sayNO;
c255a977 6088 if (n) {
eb72505d
DM
6089 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6090 * at b; check that everything between oldloc and
6091 * locinput matches */
6092 char *li = ST.oldloc;
c255a977 6093 ST.count += n;
984e6dd1 6094 if (regrepeat(rex, &li, ST.A, n, depth, is_utf8_pat) < n)
4633a7c4 6095 sayNO;
eb72505d 6096 assert(n == REG_INFTY || locinput == li);
a0d0e21e 6097 }
c255a977 6098 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6099 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6100 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6101 goto fake_end;
6102 }
4d5016e5 6103 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 6104 }
118e2215 6105 assert(0); /* NOTREACHED */
c255a977
DM
6106
6107
6108 case CURLY_B_min_fail:
6109 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
6110
6111 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6112 if (ST.paren) {
6113 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6114 }
c255a977 6115 /* failed -- move forward one */
f73aaa43 6116 {
eb72505d 6117 char *li = locinput;
984e6dd1 6118 if (!regrepeat(rex, &li, ST.A, 1, depth, is_utf8_pat)) {
f73aaa43
DM
6119 sayNO;
6120 }
eb72505d 6121 locinput = li;
f73aaa43
DM
6122 }
6123 {
c255a977 6124 ST.count++;
c255a977
DM
6125 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6126 ST.count > 0)) /* count overflow ? */
15272685 6127 {
c255a977
DM
6128 curly_try_B_min:
6129 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6130 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6131 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6132 goto fake_end;
6133 }
4d5016e5 6134 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
6135 }
6136 }
c74f6de9 6137 sayNO;
118e2215 6138 assert(0); /* NOTREACHED */
c255a977
DM
6139
6140
6141 curly_try_B_max:
6142 /* a successful greedy match: now try to match B */
40d049e4 6143 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6144 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
6145 goto fake_end;
6146 }
c255a977 6147 {
79a2a0e8
KW
6148 bool could_match = locinput < PL_regeol;
6149
c255a977 6150 /* If it could work, try it. */
79a2a0e8
KW
6151 if (ST.c1 != CHRTEST_VOID && could_match) {
6152 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6153 {
6154 could_match = memEQ(locinput,
6155 ST.c1_utf8,
6156 UTF8SKIP(locinput))
6157 || memEQ(locinput,
6158 ST.c2_utf8,
6159 UTF8SKIP(locinput));
6160 }
6161 else {
6162 could_match = UCHARAT(locinput) == ST.c1
6163 || UCHARAT(locinput) == ST.c2;
6164 }
6165 }
6166 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 6167 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 6168 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
118e2215 6169 assert(0); /* NOTREACHED */
c255a977
DM
6170 }
6171 }
6172 /* FALL THROUGH */
3c0563b9 6173
c255a977
DM
6174 case CURLY_B_max_fail:
6175 /* failed to find B in a greedy match */
c255a977
DM
6176
6177 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6178 if (ST.paren) {
6179 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6180 }
c255a977
DM
6181 /* back up. */
6182 if (--ST.count < ST.min)
6183 sayNO;
eb72505d 6184 locinput = HOPc(locinput, -1);
c255a977
DM
6185 goto curly_try_B_max;
6186
6187#undef ST
6188
3c0563b9 6189 case END: /* last op of main pattern */
6bda09f9 6190 fake_end:
faec1544
DM
6191 if (cur_eval) {
6192 /* we've just finished A in /(??{A})B/; now continue with B */
984e6dd1
DM
6193 st->u.eval.saved_utf8_pat = is_utf8_pat;
6194 is_utf8_pat = cur_eval->u.eval.saved_utf8_pat;
faec1544 6195
288b8c02 6196 st->u.eval.prev_rex = rex_sv; /* inner */
92da3157
DM
6197
6198 /* Save *all* the positions. */
6199 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
ec43f78b
DM
6200 rex_sv = cur_eval->u.eval.prev_rex;
6201 SET_reg_curpm(rex_sv);
8d919b0a 6202 rex = ReANY(rex_sv);
f8fc2ecf 6203 rexi = RXi_GET(rex);
faec1544 6204 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 6205
faec1544 6206 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
6207
6208 /* Restore parens of the outer rex without popping the
6209 * savestack */
92da3157
DM
6210 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6211 &maxopenparen);
faec1544
DM
6212
6213 st->u.eval.prev_eval = cur_eval;
6214 cur_eval = cur_eval->u.eval.prev_eval;
6215 DEBUG_EXECUTE_r(
2a49f0f5
JH
6216 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6217 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
6218 if ( nochange_depth )
6219 nochange_depth--;
6220
4d5016e5
DM
6221 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6222 locinput); /* match B */
faec1544
DM
6223 }
6224
3b0527fe 6225 if (locinput < reginfo->till) {
a3621e74 6226 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
6227 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6228 PL_colors[4],
6229 (long)(locinput - PL_reg_starttry),
3b0527fe 6230 (long)(reginfo->till - PL_reg_starttry),
7821416a 6231 PL_colors[5]));
58e23c8d 6232
262b90c4 6233 sayNO_SILENT; /* Cannot match: too short. */
7821416a 6234 }
262b90c4 6235 sayYES; /* Success! */
dad79028
DM
6236
6237 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6238 DEBUG_EXECUTE_r(
6239 PerlIO_printf(Perl_debug_log,
6240 "%*s %ssubpattern success...%s\n",
5bc10b2c 6241 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 6242 sayYES; /* Success! */
dad79028 6243
40a82448
DM
6244#undef ST
6245#define ST st->u.ifmatch
6246
37f53970
DM
6247 {
6248 char *newstart;
6249
40a82448
DM
6250 case SUSPEND: /* (?>A) */
6251 ST.wanted = 1;
37f53970 6252 newstart = locinput;
9041c2e3 6253 goto do_ifmatch;
dad79028 6254
40a82448
DM
6255 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6256 ST.wanted = 0;
dad79028
DM
6257 goto ifmatch_trivial_fail_test;
6258
40a82448
DM
6259 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6260 ST.wanted = 1;
dad79028 6261 ifmatch_trivial_fail_test:
a0ed51b3 6262 if (scan->flags) {
52657f30 6263 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
6264 if (!s) {
6265 /* trivial fail */
24d3c4a9
DM
6266 if (logical) {
6267 logical = 0;
f2338a2e 6268 sw = 1 - cBOOL(ST.wanted);
dad79028 6269 }
40a82448 6270 else if (ST.wanted)
dad79028
DM
6271 sayNO;
6272 next = scan + ARG(scan);
6273 if (next == scan)
6274 next = NULL;
6275 break;
6276 }
37f53970 6277 newstart = s;
a0ed51b3
LW
6278 }
6279 else
37f53970 6280 newstart = locinput;
a0ed51b3 6281
c277df42 6282 do_ifmatch:
40a82448 6283 ST.me = scan;
24d3c4a9 6284 ST.logical = logical;
24d786f4
YO
6285 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6286
40a82448 6287 /* execute body of (?...A) */
37f53970 6288 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
118e2215 6289 assert(0); /* NOTREACHED */
37f53970 6290 }
40a82448
DM
6291
6292 case IFMATCH_A_fail: /* body of (?...A) failed */
6293 ST.wanted = !ST.wanted;
6294 /* FALL THROUGH */
6295
6296 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 6297 if (ST.logical) {
f2338a2e 6298 sw = cBOOL(ST.wanted);
40a82448
DM
6299 }
6300 else if (!ST.wanted)
6301 sayNO;
6302
37f53970
DM
6303 if (OP(ST.me) != SUSPEND) {
6304 /* restore old position except for (?>...) */
6305 locinput = st->locinput;
40a82448
DM
6306 }
6307 scan = ST.me + ARG(ST.me);
6308 if (scan == ST.me)
6309 scan = NULL;
6310 continue; /* execute B */
6311
6312#undef ST
dad79028 6313
3c0563b9
DM
6314 case LONGJMP: /* alternative with many branches compiles to
6315 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
6316 next = scan + ARG(scan);
6317 if (next == scan)
6318 next = NULL;
a0d0e21e 6319 break;
3c0563b9
DM
6320
6321 case COMMIT: /* (*COMMIT) */
e2e6a0f1
YO
6322 reginfo->cutpoint = PL_regeol;
6323 /* FALLTHROUGH */
3c0563b9
DM
6324
6325 case PRUNE: /* (*PRUNE) */
e2e6a0f1 6326 if (!scan->flags)
ad64d0ec 6327 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6328 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
118e2215 6329 assert(0); /* NOTREACHED */
3c0563b9 6330
54612592
YO
6331 case COMMIT_next_fail:
6332 no_final = 1;
6333 /* FALLTHROUGH */
3c0563b9
DM
6334
6335 case OPFAIL: /* (*FAIL) */
7f69552c 6336 sayNO;
118e2215 6337 assert(0); /* NOTREACHED */
e2e6a0f1
YO
6338
6339#define ST st->u.mark
3c0563b9 6340 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 6341 ST.prev_mark = mark_state;
5d458dd8 6342 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 6343 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 6344 mark_state = st;
4d5016e5
DM
6345 ST.mark_loc = locinput;
6346 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
118e2215 6347 assert(0); /* NOTREACHED */
3c0563b9 6348
e2e6a0f1
YO
6349 case MARKPOINT_next:
6350 mark_state = ST.prev_mark;
6351 sayYES;
118e2215 6352 assert(0); /* NOTREACHED */
3c0563b9 6353
e2e6a0f1 6354 case MARKPOINT_next_fail:
5d458dd8 6355 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
6356 {
6357 if (ST.mark_loc > startpoint)
6358 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6359 popmark = NULL; /* we found our mark */
6360 sv_commit = ST.mark_name;
6361
6362 DEBUG_EXECUTE_r({
5d458dd8 6363 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
6364 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6365 REPORT_CODE_OFF+depth*2, "",
be2597df 6366 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
6367 });
6368 }
6369 mark_state = ST.prev_mark;
5d458dd8
YO
6370 sv_yes_mark = mark_state ?
6371 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 6372 sayNO;
118e2215 6373 assert(0); /* NOTREACHED */
3c0563b9
DM
6374
6375 case SKIP: /* (*SKIP) */
5d458dd8 6376 if (scan->flags) {
2bf803e2 6377 /* (*SKIP) : if we fail we cut here*/
5d458dd8 6378 ST.mark_name = NULL;
e2e6a0f1 6379 ST.mark_loc = locinput;
4d5016e5 6380 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 6381 } else {
2bf803e2 6382 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
6383 otherwise do nothing. Meaning we need to scan
6384 */
6385 regmatch_state *cur = mark_state;
ad64d0ec 6386 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
6387
6388 while (cur) {
6389 if ( sv_eq( cur->u.mark.mark_name,
6390 find ) )
6391 {
6392 ST.mark_name = find;
4d5016e5 6393 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
6394 }
6395 cur = cur->u.mark.prev_mark;
6396 }
e2e6a0f1 6397 }
2bf803e2 6398 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 6399 break;
3c0563b9 6400
5d458dd8
YO
6401 case SKIP_next_fail:
6402 if (ST.mark_name) {
6403 /* (*CUT:NAME) - Set up to search for the name as we
6404 collapse the stack*/
6405 popmark = ST.mark_name;
6406 } else {
6407 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
6408 if (ST.mark_loc > startpoint)
6409 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
6410 /* but we set sv_commit to latest mark_name if there
6411 is one so they can test to see how things lead to this
6412 cut */
6413 if (mark_state)
6414 sv_commit=mark_state->u.mark.mark_name;
6415 }
e2e6a0f1
YO
6416 no_final = 1;
6417 sayNO;
118e2215 6418 assert(0); /* NOTREACHED */
e2e6a0f1 6419#undef ST
3c0563b9
DM
6420
6421 case LNBREAK: /* \R */
7016d6eb 6422 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
e1d1eefb 6423 locinput += n;
e1d1eefb
YO
6424 } else
6425 sayNO;
6426 break;
6427
a0d0e21e 6428 default:
b900a521 6429 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 6430 PTR2UV(scan), OP(scan));
cea2e8a9 6431 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
6432
6433 /* this is a point to jump to in order to increment
6434 * locinput by one character */
6435 increment_locinput:
e6ca698c 6436 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
6437 if (utf8_target) {
6438 locinput += PL_utf8skip[nextchr];
7016d6eb 6439 /* locinput is allowed to go 1 char off the end, but not 2+ */
28b98f76
DM
6440 if (locinput > PL_regeol)
6441 sayNO;
28b98f76
DM
6442 }
6443 else
3640db6b 6444 locinput++;
28b98f76 6445 break;
5d458dd8
YO
6446
6447 } /* end switch */
95b24440 6448
5d458dd8
YO
6449 /* switch break jumps here */
6450 scan = next; /* prepare to execute the next op and ... */
6451 continue; /* ... jump back to the top, reusing st */
118e2215 6452 assert(0); /* NOTREACHED */
95b24440 6453
40a82448
DM
6454 push_yes_state:
6455 /* push a state that backtracks on success */
6456 st->u.yes.prev_yes_state = yes_state;
6457 yes_state = st;
6458 /* FALL THROUGH */
6459 push_state:
6460 /* push a new regex state, then continue at scan */
6461 {
6462 regmatch_state *newst;
6463
24b23f37
YO
6464 DEBUG_STACK_r({
6465 regmatch_state *cur = st;
6466 regmatch_state *curyes = yes_state;
6467 int curd = depth;
6468 regmatch_slab *slab = PL_regmatch_slab;
6469 for (;curd > -1;cur--,curd--) {
6470 if (cur < SLAB_FIRST(slab)) {
6471 slab = slab->prev;
6472 cur = SLAB_LAST(slab);
6473 }
6474 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6475 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6476 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6477 (curyes == cur) ? "yes" : ""
6478 );
6479 if (curyes == cur)
6480 curyes = cur->u.yes.prev_yes_state;
6481 }
6482 } else
6483 DEBUG_STATE_pp("push")
6484 );
40a82448 6485 depth++;
40a82448
DM
6486 st->locinput = locinput;
6487 newst = st+1;
6488 if (newst > SLAB_LAST(PL_regmatch_slab))
6489 newst = S_push_slab(aTHX);
6490 PL_regmatch_state = newst;
786e8c11 6491
4d5016e5 6492 locinput = pushinput;
40a82448
DM
6493 st = newst;
6494 continue;
118e2215 6495 assert(0); /* NOTREACHED */
40a82448 6496 }
a0d0e21e 6497 }
a687059c 6498
a0d0e21e
LW
6499 /*
6500 * We get here only if there's trouble -- normally "case END" is
6501 * the terminating point.
6502 */
cea2e8a9 6503 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6504 /*NOTREACHED*/
4633a7c4
LW
6505 sayNO;
6506
262b90c4 6507yes:
77cb431f
DM
6508 if (yes_state) {
6509 /* we have successfully completed a subexpression, but we must now
6510 * pop to the state marked by yes_state and continue from there */
77cb431f 6511 assert(st != yes_state);
5bc10b2c
DM
6512#ifdef DEBUGGING
6513 while (st != yes_state) {
6514 st--;
6515 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6516 PL_regmatch_slab = PL_regmatch_slab->prev;
6517 st = SLAB_LAST(PL_regmatch_slab);
6518 }
e2e6a0f1 6519 DEBUG_STATE_r({
54612592
YO
6520 if (no_final) {
6521 DEBUG_STATE_pp("pop (no final)");
6522 } else {
6523 DEBUG_STATE_pp("pop (yes)");
6524 }
e2e6a0f1 6525 });
5bc10b2c
DM
6526 depth--;
6527 }
6528#else
77cb431f
DM
6529 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6530 || yes_state > SLAB_LAST(PL_regmatch_slab))
6531 {
6532 /* not in this slab, pop slab */
6533 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6534 PL_regmatch_slab = PL_regmatch_slab->prev;
6535 st = SLAB_LAST(PL_regmatch_slab);
6536 }
6537 depth -= (st - yes_state);
5bc10b2c 6538#endif
77cb431f
DM
6539 st = yes_state;
6540 yes_state = st->u.yes.prev_yes_state;
6541 PL_regmatch_state = st;
24b23f37 6542
3640db6b 6543 if (no_final)
5d458dd8 6544 locinput= st->locinput;
54612592 6545 state_num = st->resume_state + no_final;
24d3c4a9 6546 goto reenter_switch;
77cb431f
DM
6547 }
6548
a3621e74 6549 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6550 PL_colors[4], PL_colors[5]));
02db2b7b 6551
ed301438 6552 if (PL_reg_state.re_state_eval_setup_done) {
19b95bf0
DM
6553 /* each successfully executed (?{...}) block does the equivalent of
6554 * local $^R = do {...}
6555 * When popping the save stack, all these locals would be undone;
6556 * bypass this by setting the outermost saved $^R to the latest
6557 * value */
6558 if (oreplsv != GvSV(PL_replgv))
6559 sv_setsv(oreplsv, GvSV(PL_replgv));
6560 }
95b24440 6561 result = 1;
aa283a38 6562 goto final_exit;
4633a7c4
LW
6563
6564no:
a3621e74 6565 DEBUG_EXECUTE_r(
7821416a 6566 PerlIO_printf(Perl_debug_log,
786e8c11 6567 "%*s %sfailed...%s\n",
5bc10b2c 6568 REPORT_CODE_OFF+depth*2, "",
786e8c11 6569 PL_colors[4], PL_colors[5])
7821416a 6570 );
aa283a38 6571
262b90c4 6572no_silent:
54612592
YO
6573 if (no_final) {
6574 if (yes_state) {
6575 goto yes;
6576 } else {
6577 goto final_exit;
6578 }
6579 }
aa283a38
DM
6580 if (depth) {
6581 /* there's a previous state to backtrack to */
40a82448
DM
6582 st--;
6583 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6584 PL_regmatch_slab = PL_regmatch_slab->prev;
6585 st = SLAB_LAST(PL_regmatch_slab);
6586 }
6587 PL_regmatch_state = st;
40a82448 6588 locinput= st->locinput;
40a82448 6589
5bc10b2c
DM
6590 DEBUG_STATE_pp("pop");
6591 depth--;
262b90c4
DM
6592 if (yes_state == st)
6593 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6594
24d3c4a9
DM
6595 state_num = st->resume_state + 1; /* failure = success + 1 */
6596 goto reenter_switch;
95b24440 6597 }
24d3c4a9 6598 result = 0;
aa283a38 6599
262b90c4 6600 final_exit:
bbe252da 6601 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6602 SV *sv_err = get_sv("REGERROR", 1);
6603 SV *sv_mrk = get_sv("REGMARK", 1);
6604 if (result) {
e2e6a0f1 6605 sv_commit = &PL_sv_no;
5d458dd8
YO
6606 if (!sv_yes_mark)
6607 sv_yes_mark = &PL_sv_yes;
6608 } else {
6609 if (!sv_commit)
6610 sv_commit = &PL_sv_yes;
6611 sv_yes_mark = &PL_sv_no;
6612 }
6613 sv_setsv(sv_err, sv_commit);
6614 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6615 }
19b95bf0 6616
81ed78b2
DM
6617
6618 if (last_pushed_cv) {
6619 dSP;
6620 POP_MULTICALL;
4f8dbb2d 6621 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6622 }
6623
2f554ef7
DM
6624 /* clean up; in particular, free all slabs above current one */
6625 LEAVE_SCOPE(oldsave);
5d9a96ca 6626
730f4c74
DM
6627 assert(!result || locinput - PL_bostr >= 0);
6628 return result ? locinput - PL_bostr : -1;
a687059c
LW
6629}
6630
6631/*
6632 - regrepeat - repeatedly match something simple, report how many
d60de1d1 6633 *
e64f369d
KW
6634 * What 'simple' means is a node which can be the operand of a quantifier like
6635 * '+', or {1,3}
6636 *
d60de1d1
DM
6637 * startposp - pointer a pointer to the start position. This is updated
6638 * to point to the byte following the highest successful
6639 * match.
6640 * p - the regnode to be repeatedly matched against.
4063ade8 6641 * max - maximum number of things to match.
d60de1d1 6642 * depth - (for debugging) backtracking depth.
a687059c 6643 */
76e3520e 6644STATIC I32
272d35c9 6645S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
984e6dd1 6646 I32 max, int depth, bool is_utf8_pat)
a687059c 6647{
27da23d5 6648 dVAR;
4063ade8 6649 char *scan; /* Pointer to current position in target string */
eb578fdb 6650 I32 c;
4063ade8
KW
6651 char *loceol = PL_regeol; /* local version */
6652 I32 hardcount = 0; /* How many matches so far */
eb578fdb 6653 bool utf8_target = PL_reg_match_utf8;
3018b823 6654 int to_complement = 0; /* Invert the result? */
d513472c 6655 UV utf8_flags;
3018b823 6656 _char_class_number classnum;
4f55667c
SP
6657#ifndef DEBUGGING
6658 PERL_UNUSED_ARG(depth);
6659#endif
a0d0e21e 6660
7918f24d
NC
6661 PERL_ARGS_ASSERT_REGREPEAT;
6662
f73aaa43 6663 scan = *startposp;
faf11cac
HS
6664 if (max == REG_INFTY)
6665 max = I32_MAX;
4063ade8 6666 else if (! utf8_target && scan + max < loceol)
7f596f4c 6667 loceol = scan + max;
4063ade8
KW
6668
6669 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6670 * to the maximum of how far we should go in it (leaving it set to the real
6671 * end, if the maximum permissible would take us beyond that). This allows
6672 * us to make the loop exit condition that we haven't gone past <loceol> to
6673 * also mean that we haven't exceeded the max permissible count, saving a
6674 * test each time through the loop. But it assumes that the OP matches a
6675 * single byte, which is true for most of the OPs below when applied to a
6676 * non-UTF-8 target. Those relatively few OPs that don't have this
6677 * characteristic will have to compensate.
6678 *
6679 * There is no adjustment for UTF-8 targets, as the number of bytes per
6680 * character varies. OPs will have to test both that the count is less
6681 * than the max permissible (using <hardcount> to keep track), and that we
6682 * are still within the bounds of the string (using <loceol>. A few OPs
6683 * match a single byte no matter what the encoding. They can omit the max
6684 * test if, for the UTF-8 case, they do the adjustment that was skipped
6685 * above.
6686 *
6687 * Thus, the code above sets things up for the common case; and exceptional
6688 * cases need extra work; the common case is to make sure <scan> doesn't
6689 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6690 * count doesn't exceed the maximum permissible */
6691
a0d0e21e 6692 switch (OP(p)) {
22c35a8c 6693 case REG_ANY:
f2ed9b32 6694 if (utf8_target) {
1aa99e6b 6695 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6696 scan += UTF8SKIP(scan);
6697 hardcount++;
6698 }
6699 } else {
6700 while (scan < loceol && *scan != '\n')
6701 scan++;
a0ed51b3
LW
6702 }
6703 break;
ffc61ed2 6704 case SANY:
f2ed9b32 6705 if (utf8_target) {
a0804c9e 6706 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6707 scan += UTF8SKIP(scan);
6708 hardcount++;
6709 }
6710 }
6711 else
6712 scan = loceol;
a0ed51b3 6713 break;
4063ade8
KW
6714 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6715 if (utf8_target && scan + max < loceol) {
6716
6717 /* <loceol> hadn't been adjusted in the UTF-8 case */
6718 scan += max;
6719 }
6720 else {
6721 scan = loceol;
6722 }
f33976b4 6723 break;
59d32103 6724 case EXACT:
984e6dd1 6725 assert(STR_LEN(p) == is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 6726
59d32103 6727 c = (U8)*STRING(p);
59d32103 6728
5e4a1da1
KW
6729 /* Can use a simple loop if the pattern char to match on is invariant
6730 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6731 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6732 * true iff it doesn't matter if the argument is in UTF-8 or not */
984e6dd1 6733 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! is_utf8_pat)) {
4063ade8
KW
6734 if (utf8_target && scan + max < loceol) {
6735 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6736 * since here, to match at all, 1 char == 1 byte */
6737 loceol = scan + max;
6738 }
59d32103
KW
6739 while (scan < loceol && UCHARAT(scan) == c) {
6740 scan++;
6741 }
6742 }
984e6dd1 6743 else if (is_utf8_pat) {
5e4a1da1
KW
6744 if (utf8_target) {
6745 STRLEN scan_char_len;
5e4a1da1 6746
4063ade8 6747 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
6748 * string EQ */
6749 while (hardcount < max
9a902117
KW
6750 && scan < loceol
6751 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
5e4a1da1
KW
6752 && memEQ(scan, STRING(p), scan_char_len))
6753 {
4200a00c 6754 scan += scan_char_len;
5e4a1da1
KW
6755 hardcount++;
6756 }
6757 }
6758 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 6759
5e4a1da1
KW
6760 /* Target isn't utf8; convert the character in the UTF-8
6761 * pattern to non-UTF8, and do a simple loop */
6762 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6763 while (scan < loceol && UCHARAT(scan) == c) {
6764 scan++;
6765 }
6766 } /* else pattern char is above Latin1, can't possibly match the
6767 non-UTF-8 target */
b40a2c17 6768 }
5e4a1da1 6769 else {
59d32103 6770
5e4a1da1
KW
6771 /* Here, the string must be utf8; pattern isn't, and <c> is
6772 * different in utf8 than not, so can't compare them directly.
6773 * Outside the loop, find the two utf8 bytes that represent c, and
6774 * then look for those in sequence in the utf8 string */
59d32103
KW
6775 U8 high = UTF8_TWO_BYTE_HI(c);
6776 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
6777
6778 while (hardcount < max
6779 && scan + 1 < loceol
6780 && UCHARAT(scan) == high
6781 && UCHARAT(scan + 1) == low)
6782 {
6783 scan += 2;
6784 hardcount++;
6785 }
6786 }
6787 break;
5e4a1da1 6788
2f7f8cb1
KW
6789 case EXACTFA:
6790 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6791 goto do_exactf;
6792
d4e0b827 6793 case EXACTFL:
272d35c9 6794 RXp_MATCH_TAINTED_on(prog);
17580e7a
KW
6795 utf8_flags = FOLDEQ_UTF8_LOCALE;
6796 goto do_exactf;
6797
d4e0b827 6798 case EXACTF:
62bf7766
KW
6799 utf8_flags = 0;
6800 goto do_exactf;
6801
3c760661 6802 case EXACTFU_SS:
fab2782b 6803 case EXACTFU_TRICKYFOLD:
9a5a5549 6804 case EXACTFU:
984e6dd1 6805 utf8_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6806
613a425d
KW
6807 do_exactf: {
6808 int c1, c2;
6809 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 6810
984e6dd1 6811 assert(STR_LEN(p) == is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 6812
984e6dd1
DM
6813 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6814 is_utf8_pat))
6815 {
613a425d 6816 if (c1 == CHRTEST_VOID) {
49b95fad 6817 /* Use full Unicode fold matching */
4063ade8 6818 char *tmpeol = PL_regeol;
984e6dd1 6819 STRLEN pat_len = is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
49b95fad
KW
6820 while (hardcount < max
6821 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6822 STRING(p), NULL, pat_len,
984e6dd1 6823 is_utf8_pat, utf8_flags))
49b95fad
KW
6824 {
6825 scan = tmpeol;
4063ade8 6826 tmpeol = PL_regeol;
49b95fad
KW
6827 hardcount++;
6828 }
613a425d
KW
6829 }
6830 else if (utf8_target) {
6831 if (c1 == c2) {
4063ade8
KW
6832 while (scan < loceol
6833 && hardcount < max
613a425d
KW
6834 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6835 {
6836 scan += UTF8SKIP(scan);
6837 hardcount++;
6838 }
6839 }
6840 else {
4063ade8
KW
6841 while (scan < loceol
6842 && hardcount < max
613a425d
KW
6843 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6844 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6845 {
6846 scan += UTF8SKIP(scan);
6847 hardcount++;
6848 }
6849 }
6850 }
6851 else if (c1 == c2) {
6852 while (scan < loceol && UCHARAT(scan) == c1) {
6853 scan++;
6854 }
6855 }
6856 else {
6857 while (scan < loceol &&
6858 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6859 {
6860 scan++;
6861 }
6862 }
634c83a2 6863 }
bbce6d69 6864 break;
613a425d 6865 }
a0d0e21e 6866 case ANYOF:
954a2af6 6867 case ANYOF_WARN_SUPER:
e0193e47 6868 if (utf8_target) {
4e8910e0 6869 while (hardcount < max
9a902117 6870 && scan < loceol
635cd5d4 6871 && reginclass(prog, p, (U8*)scan, utf8_target))
4e8910e0 6872 {
9a902117 6873 scan += UTF8SKIP(scan);
ffc61ed2
JH
6874 hardcount++;
6875 }
6876 } else {
32fc9b6a 6877 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6878 scan++;
6879 }
a0d0e21e 6880 break;
4063ade8 6881
3018b823 6882 /* The argument (FLAGS) to all the POSIX node types is the class number */
980866de 6883
3018b823
KW
6884 case NPOSIXL:
6885 to_complement = 1;
6886 /* FALLTHROUGH */
980866de 6887
3018b823 6888 case POSIXL:
272d35c9 6889 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
6890 if (! utf8_target) {
6891 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6892 *scan)))
a12cf05f 6893 {
3018b823
KW
6894 scan++;
6895 }
6896 } else {
6897 while (hardcount < max && scan < loceol
6898 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6899 (U8 *) scan)))
6900 {
6901 scan += UTF8SKIP(scan);
ffc61ed2
JH
6902 hardcount++;
6903 }
a0ed51b3
LW
6904 }
6905 break;
0658cdde 6906
3018b823
KW
6907 case POSIXD:
6908 if (utf8_target) {
6909 goto utf8_posix;
6910 }
6911 /* FALLTHROUGH */
6912
0658cdde 6913 case POSIXA:
4063ade8
KW
6914 if (utf8_target && scan + max < loceol) {
6915
7aee35ff
KW
6916 /* We didn't adjust <loceol> at the beginning of this routine
6917 * because is UTF-8, but it is actually ok to do so, since here, to
6918 * match, 1 char == 1 byte. */
4063ade8
KW
6919 loceol = scan + max;
6920 }
6921 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
6922 scan++;
6923 }
6924 break;
980866de 6925
3018b823
KW
6926 case NPOSIXD:
6927 if (utf8_target) {
6928 to_complement = 1;
6929 goto utf8_posix;
6930 }
6931 /* FALL THROUGH */
980866de 6932
3018b823
KW
6933 case NPOSIXA:
6934 if (! utf8_target) {
6935 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
a12cf05f
KW
6936 scan++;
6937 }
4063ade8 6938 }
3018b823 6939 else {
980866de 6940
3018b823
KW
6941 /* The complement of something that matches only ASCII matches all
6942 * UTF-8 variant code points, plus everything in ASCII that isn't
6943 * in the class. */
bedac28b 6944 while (hardcount < max && scan < loceol
3018b823
KW
6945 && (! UTF8_IS_INVARIANT(*scan)
6946 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
a12cf05f 6947 {
3018b823 6948 scan += UTF8SKIP(scan);
ffc61ed2
JH
6949 hardcount++;
6950 }
3018b823
KW
6951 }
6952 break;
980866de 6953
3018b823
KW
6954 case NPOSIXU:
6955 to_complement = 1;
6956 /* FALLTHROUGH */
6957
6958 case POSIXU:
6959 if (! utf8_target) {
6960 while (scan < loceol && to_complement
6961 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
4063ade8 6962 {
3018b823
KW
6963 scan++;
6964 }
cfaf538b
KW
6965 }
6966 else {
3018b823
KW
6967 utf8_posix:
6968 classnum = (_char_class_number) FLAGS(p);
6969 if (classnum < _FIRST_NON_SWASH_CC) {
6970
6971 /* Here, a swash is needed for above-Latin1 code points.
6972 * Process as many Latin1 code points using the built-in rules.
6973 * Go to another loop to finish processing upon encountering
6974 * the first Latin1 code point. We could do that in this loop
6975 * as well, but the other way saves having to test if the swash
6976 * has been loaded every time through the loop: extra space to
6977 * save a test. */
6978 while (hardcount < max && scan < loceol) {
6979 if (UTF8_IS_INVARIANT(*scan)) {
6980 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
6981 classnum))))
6982 {
6983 break;
6984 }
6985 scan++;
6986 }
6987 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
6988 if (! (to_complement
6989 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
6990 *(scan + 1)),
6991 classnum))))
6992 {
6993 break;
6994 }
6995 scan += 2;
6996 }
6997 else {
6998 goto found_above_latin1;
6999 }
7000
7001 hardcount++;
7002 }
7003 }
7004 else {
7005 /* For these character classes, the knowledge of how to handle
7006 * every code point is compiled in to Perl via a macro. This
7007 * code is written for making the loops as tight as possible.
7008 * It could be refactored to save space instead */
7009 switch (classnum) {
7010 case _CC_ENUM_SPACE: /* XXX would require separate code
7011 if we revert the change of \v
7012 matching this */
7013 /* FALL THROUGH */
7014 case _CC_ENUM_PSXSPC:
7015 while (hardcount < max
7016 && scan < loceol
7017 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7018 {
7019 scan += UTF8SKIP(scan);
7020 hardcount++;
7021 }
7022 break;
7023 case _CC_ENUM_BLANK:
7024 while (hardcount < max
7025 && scan < loceol
7026 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7027 {
7028 scan += UTF8SKIP(scan);
7029 hardcount++;
7030 }
7031 break;
7032 case _CC_ENUM_XDIGIT:
7033 while (hardcount < max
7034 && scan < loceol
7035 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7036 {
7037 scan += UTF8SKIP(scan);
7038 hardcount++;
7039 }
7040 break;
7041 case _CC_ENUM_VERTSPACE:
7042 while (hardcount < max
7043 && scan < loceol
7044 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7045 {
7046 scan += UTF8SKIP(scan);
7047 hardcount++;
7048 }
7049 break;
7050 case _CC_ENUM_CNTRL:
7051 while (hardcount < max
7052 && scan < loceol
7053 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7054 {
7055 scan += UTF8SKIP(scan);
7056 hardcount++;
7057 }
7058 break;
7059 default:
7060 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7061 }
7062 }
a0ed51b3 7063 }
3018b823 7064 break;
4063ade8 7065
3018b823
KW
7066 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7067
7068 /* Load the swash if not already present */
7069 if (! PL_utf8_swash_ptrs[classnum]) {
7070 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7071 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7072 "utf8", swash_property_names[classnum],
7073 &PL_sv_undef, 1, 0, NULL, &flags);
4063ade8 7074 }
3018b823
KW
7075
7076 while (hardcount < max && scan < loceol
7077 && to_complement ^ cBOOL(_generic_utf8(
7078 classnum,
7079 scan,
7080 swash_fetch(PL_utf8_swash_ptrs[classnum],
7081 (U8 *) scan,
7082 TRUE))))
7083 {
7084 scan += UTF8SKIP(scan);
7085 hardcount++;
7086 }
7087 break;
7088
e1d1eefb 7089 case LNBREAK:
e64f369d
KW
7090 if (utf8_target) {
7091 while (hardcount < max && scan < loceol &&
7092 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7093 scan += c;
7094 hardcount++;
7095 }
7096 } else {
7097 /* LNBREAK can match one or two latin chars, which is ok, but we
7098 * have to use hardcount in this situation, and throw away the
7099 * adjustment to <loceol> done before the switch statement */
7100 loceol = PL_regeol;
7101 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7102 scan+=c;
7103 hardcount++;
7104 }
7105 }
7106 break;
e1d1eefb 7107
584b1f02
KW
7108 case BOUND:
7109 case BOUNDA:
7110 case BOUNDL:
7111 case BOUNDU:
7112 case EOS:
7113 case GPOS:
7114 case KEEPS:
7115 case NBOUND:
7116 case NBOUNDA:
7117 case NBOUNDL:
7118 case NBOUNDU:
7119 case OPFAIL:
7120 case SBOL:
7121 case SEOL:
7122 /* These are all 0 width, so match right here or not at all. */
7123 break;
7124
7125 default:
7126 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7127 assert(0); /* NOTREACHED */
7128
a0d0e21e 7129 }
a687059c 7130
a0ed51b3
LW
7131 if (hardcount)
7132 c = hardcount;
7133 else
f73aaa43
DM
7134 c = scan - *startposp;
7135 *startposp = scan;
a687059c 7136
a3621e74 7137 DEBUG_r({
e68ec53f 7138 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 7139 DEBUG_EXECUTE_r({
e68ec53f
YO
7140 SV * const prop = sv_newmortal();
7141 regprop(prog, prop, p);
7142 PerlIO_printf(Perl_debug_log,
be8e71aa 7143 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 7144 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 7145 });
be8e71aa 7146 });
9041c2e3 7147
a0d0e21e 7148 return(c);
a687059c
LW
7149}
7150
c277df42 7151
be8e71aa 7152#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 7153/*
6c6525b8 7154- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
7155create a copy so that changes the caller makes won't change the shared one.
7156If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 7157 */
ffc61ed2 7158SV *
5aaab254 7159Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 7160{
6c6525b8 7161 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
7162
7163 if (altsvp) {
7164 *altsvp = NULL;
7165 }
7166
7167 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
6c6525b8
KW
7168}
7169#endif
7170
7171STATIC SV *
5aaab254 7172S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
6c6525b8 7173{
8c9eb58f
KW
7174 /* Returns the swash for the input 'node' in the regex 'prog'.
7175 * If <doinit> is true, will attempt to create the swash if not already
7176 * done.
7177 * If <listsvp> is non-null, will return the swash initialization string in
7178 * it.
8c9eb58f
KW
7179 * Tied intimately to how regcomp.c sets up the data structure */
7180
97aff369 7181 dVAR;
9e55ce06
JH
7182 SV *sw = NULL;
7183 SV *si = NULL;
7a6c6baa
KW
7184 SV* invlist = NULL;
7185
f8fc2ecf
YO
7186 RXi_GET_DECL(prog,progi);
7187 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 7188
6c6525b8 7189 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 7190
ccb2541c
KW
7191 assert(ANYOF_NONBITMAP(node));
7192
4f639d21 7193 if (data && data->count) {
a3b680e6 7194 const U32 n = ARG(node);
ffc61ed2 7195
4f639d21 7196 if (data->what[n] == 's') {
ad64d0ec
NC
7197 SV * const rv = MUTABLE_SV(data->data[n]);
7198 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 7199 SV **const ary = AvARRAY(av);
87367d5f 7200 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 7201
8c9eb58f 7202 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 7203
88675427
KW
7204 /* Elements 2 and 3 are either both present or both absent. [2] is
7205 * any inversion list generated at compile time; [3] indicates if
7a6c6baa 7206 * that inversion list has any user-defined properties in it. */
88675427
KW
7207 if (av_len(av) >= 2) {
7208 invlist = ary[2];
7209 if (SvUV(ary[3])) {
83199d38
KW
7210 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7211 }
7a6c6baa
KW
7212 }
7213 else {
7214 invlist = NULL;
7a6c6baa
KW
7215 }
7216
8c9eb58f
KW
7217 /* Element [1] is reserved for the set-up swash. If already there,
7218 * return it; if not, create it and store it there */
f192cf32
KW
7219 if (SvROK(ary[1])) {
7220 sw = ary[1];
7221 }
ffc61ed2 7222 else if (si && doinit) {
7a6c6baa
KW
7223
7224 sw = _core_swash_init("utf8", /* the utf8 package */
7225 "", /* nameless */
7226 si,
7227 1, /* binary */
7228 0, /* not from tr/// */
7a6c6baa 7229 invlist,
83199d38 7230 &swash_init_flags);
ffc61ed2
JH
7231 (void)av_store(av, 1, sw);
7232 }
7233 }
7234 }
7235
7a6c6baa
KW
7236 if (listsvp) {
7237 SV* matches_string = newSVpvn("", 0);
7a6c6baa
KW
7238
7239 /* Use the swash, if any, which has to have incorporated into it all
7240 * possibilities */
872dd7e0
KW
7241 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7242 && (si && si != &PL_sv_undef))
7243 {
7a6c6baa 7244
872dd7e0 7245 /* If no swash, use the input initialization string, if available */
7a6c6baa
KW
7246 sv_catsv(matches_string, si);
7247 }
7248
7249 /* Add the inversion list to whatever we have. This may have come from
7250 * the swash, or from an input parameter */
7251 if (invlist) {
7252 sv_catsv(matches_string, _invlist_contents(invlist));
7253 }
7254 *listsvp = matches_string;
7255 }
7256
ffc61ed2
JH
7257 return sw;
7258}
7259
7260/*
ba7b4546 7261 - reginclass - determine if a character falls into a character class
832705d4 7262
6698fab5
KW
7263 n is the ANYOF regnode
7264 p is the target string
6698fab5 7265 utf8_target tells whether p is in UTF-8.
832705d4 7266
635cd5d4 7267 Returns true if matched; false otherwise.
eba1359e 7268
d5788240
KW
7269 Note that this can be a synthetic start class, a combination of various
7270 nodes, so things you think might be mutually exclusive, such as locale,
7271 aren't. It can match both locale and non-locale
7272
bbce6d69 7273 */
7274
76e3520e 7275STATIC bool
272d35c9 7276S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
bbce6d69 7277{
27da23d5 7278 dVAR;
a3b680e6 7279 const char flags = ANYOF_FLAGS(n);
bbce6d69 7280 bool match = FALSE;
cc07378b 7281 UV c = *p;
1aa99e6b 7282
7918f24d
NC
7283 PERL_ARGS_ASSERT_REGINCLASS;
7284
afd2eb18
KW
7285 /* If c is not already the code point, get it. Note that
7286 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7287 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 7288 STRLEN c_len = 0;
f7ab54c6 7289 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
7290 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7291 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7292 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7293 * UTF8_ALLOW_FFFF */
f7ab54c6 7294 if (c_len == (STRLEN)-1)
e8a70c6f 7295 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 7296 }
4b3cda86 7297
7cdde544
KW
7298 /* If this character is potentially in the bitmap, check it */
7299 if (c < 256) {
ffc61ed2
JH
7300 if (ANYOF_BITMAP_TEST(n, c))
7301 match = TRUE;
11454c59
KW
7302 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7303 && ! utf8_target
7304 && ! isASCII(c))
7305 {
7306 match = TRUE;
7307 }
78969a98 7308 else if (flags & ANYOF_LOCALE) {
272d35c9 7309 RXp_MATCH_TAINTED_on(prog);
78969a98 7310
538b546e 7311 if ((flags & ANYOF_LOC_FOLD)
78969a98
KW
7312 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7313 {
ffc61ed2 7314 match = TRUE;
78969a98 7315 }
31c7f561
KW
7316 else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7317
7318 /* The data structure is arranged so bits 0, 2, 4, ... are set
7319 * if the class includes the Posix character class given by
7320 * bit/2; and 1, 3, 5, ... are set if the class includes the
7321 * complemented Posix class given by int(bit/2). So we loop
7322 * through the bits, each time changing whether we complement
7323 * the result or not. Suppose for the sake of illustration
7324 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7325 * is set, it means there is a match for this ANYOF node if the
7326 * character is in the class given by the expression (0 / 2 = 0
7327 * = \w). If it is in that class, isFOO_lc() will return 1,
7328 * and since 'to_complement' is 0, the result will stay TRUE,
7329 * and we exit the loop. Suppose instead that bit 0 is 0, but
7330 * bit 1 is 1. That means there is a match if the character
7331 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7332 * but will on bit 1. On the second iteration 'to_complement'
7333 * will be 1, so the exclusive or will reverse things, so we
7334 * are testing for \W. On the third iteration, 'to_complement'
7335 * will be 0, and we would be testing for \s; the fourth
b0d691b2
KW
7336 * iteration would test for \S, etc.
7337 *
7338 * Note that this code assumes that all the classes are closed
7339 * under folding. For example, if a character matches \w, then
7340 * its fold does too; and vice versa. This should be true for
7341 * any well-behaved locale for all the currently defined Posix
7342 * classes, except for :lower: and :upper:, which are handled
7343 * by the pseudo-class :cased: which matches if either of the
7344 * other two does. To get rid of this assumption, an outer
7345 * loop could be used below to iterate over both the source
7346 * character, and its fold (if different) */
31c7f561
KW
7347
7348 int count = 0;
7349 int to_complement = 0;
7350 while (count < ANYOF_MAX) {
7351 if (ANYOF_CLASS_TEST(n, count)
7352 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7353 {
7354 match = TRUE;
7355 break;
7356 }
7357 count++;
7358 to_complement ^= 1;
7359 }
ffc61ed2 7360 }
a0ed51b3 7361 }
a0ed51b3
LW
7362 }
7363
7cdde544 7364 /* If the bitmap didn't (or couldn't) match, and something outside the
1f327b5e 7365 * bitmap could match, try that. Locale nodes specify completely the
de87c4fe 7366 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 7367 * cause them to be treated as Unicode and not locale), except in
de87c4fe 7368 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
7369 * could be a combination of locale and non-locale nodes. So allow locale
7370 * to match for the synthetic start class, which will give a false
7371 * positive that will be resolved when the match is done again as not part
7372 * of the synthetic start class */
ef87b810 7373 if (!match) {
10ee90d2
KW
7374 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7375 match = TRUE; /* Everything above 255 matches */
e051a21d 7376 }
6f8d7d0d
KW
7377 else if (ANYOF_NONBITMAP(n)
7378 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7379 || (utf8_target
7380 && (c >=256
7381 || (! (flags & ANYOF_LOCALE))
9aa1e39f 7382 || OP(n) == ANYOF_SYNTHETIC))))
ef87b810 7383 {
e0193e47 7384 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7cdde544
KW
7385 if (sw) {
7386 U8 * utf8_p;
7387 if (utf8_target) {
7388 utf8_p = (U8 *) p;
e0193e47
KW
7389 } else { /* Convert to utf8 */
7390 STRLEN len = 1;
7cdde544
KW
7391 utf8_p = bytes_to_utf8(p, &len);
7392 }
f56b6394 7393
e0193e47 7394 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 7395 match = TRUE;
e0193e47 7396 }
7cdde544
KW
7397
7398 /* If we allocated a string above, free it */
7399 if (! utf8_target) Safefree(utf8_p);
7400 }
7401 }
5073ffbd
KW
7402
7403 if (UNICODE_IS_SUPER(c)
954a2af6 7404 && OP(n) == ANYOF_WARN_SUPER
5073ffbd
KW
7405 && ckWARN_d(WARN_NON_UNICODE))
7406 {
7407 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7408 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7409 }
7cdde544
KW
7410 }
7411
f0fdc1c9
KW
7412 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7413 return cBOOL(flags & ANYOF_INVERT) ^ match;
a0ed51b3 7414}
161b471a 7415
dfe13c55 7416STATIC U8 *
0ce71af7 7417S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7418{
6af86488
KW
7419 /* return the position 'off' UTF-8 characters away from 's', forward if
7420 * 'off' >= 0, backwards if negative. But don't go outside of position
7421 * 'lim', which better be < s if off < 0 */
7422
97aff369 7423 dVAR;
7918f24d
NC
7424
7425 PERL_ARGS_ASSERT_REGHOP3;
7426
a0ed51b3 7427 if (off >= 0) {
1aa99e6b 7428 while (off-- && s < lim) {
ffc61ed2 7429 /* XXX could check well-formedness here */
a0ed51b3 7430 s += UTF8SKIP(s);
ffc61ed2 7431 }
a0ed51b3
LW
7432 }
7433 else {
1de06328
YO
7434 while (off++ && s > lim) {
7435 s--;
7436 if (UTF8_IS_CONTINUED(*s)) {
7437 while (s > lim && UTF8_IS_CONTINUATION(*s))
7438 s--;
a0ed51b3 7439 }
1de06328 7440 /* XXX could check well-formedness here */
a0ed51b3
LW
7441 }
7442 }
7443 return s;
7444}
161b471a 7445
f9f4320a
YO
7446#ifdef XXX_dmq
7447/* there are a bunch of places where we use two reghop3's that should
7448 be replaced with this routine. but since thats not done yet
7449 we ifdef it out - dmq
7450*/
dfe13c55 7451STATIC U8 *
1de06328
YO
7452S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7453{
7454 dVAR;
7918f24d
NC
7455
7456 PERL_ARGS_ASSERT_REGHOP4;
7457
1de06328
YO
7458 if (off >= 0) {
7459 while (off-- && s < rlim) {
7460 /* XXX could check well-formedness here */
7461 s += UTF8SKIP(s);
7462 }
7463 }
7464 else {
7465 while (off++ && s > llim) {
7466 s--;
7467 if (UTF8_IS_CONTINUED(*s)) {
7468 while (s > llim && UTF8_IS_CONTINUATION(*s))
7469 s--;
7470 }
7471 /* XXX could check well-formedness here */
7472 }
7473 }
7474 return s;
7475}
f9f4320a 7476#endif
1de06328
YO
7477
7478STATIC U8 *
0ce71af7 7479S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7480{
97aff369 7481 dVAR;
7918f24d
NC
7482
7483 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7484
a0ed51b3 7485 if (off >= 0) {
1aa99e6b 7486 while (off-- && s < lim) {
ffc61ed2 7487 /* XXX could check well-formedness here */
a0ed51b3 7488 s += UTF8SKIP(s);
ffc61ed2 7489 }
a0ed51b3 7490 if (off >= 0)
3dab1dad 7491 return NULL;
a0ed51b3
LW
7492 }
7493 else {
1de06328
YO
7494 while (off++ && s > lim) {
7495 s--;
7496 if (UTF8_IS_CONTINUED(*s)) {
7497 while (s > lim && UTF8_IS_CONTINUATION(*s))
7498 s--;
a0ed51b3 7499 }
1de06328 7500 /* XXX could check well-formedness here */
a0ed51b3
LW
7501 }
7502 if (off <= 0)
3dab1dad 7503 return NULL;
a0ed51b3
LW
7504 }
7505 return s;
7506}
51371543 7507
51371543 7508static void
acfe0abc 7509restore_pos(pTHX_ void *arg)
51371543 7510{
97aff369 7511 dVAR;
097eb12c 7512 regexp * const rex = (regexp *)arg;
ed301438 7513 if (PL_reg_state.re_state_eval_setup_done) {
51371543 7514 if (PL_reg_oldsaved) {
4f639d21
DM
7515 rex->subbeg = PL_reg_oldsaved;
7516 rex->sublen = PL_reg_oldsavedlen;
6502e081
DM
7517 rex->suboffset = PL_reg_oldsavedoffset;
7518 rex->subcoffset = PL_reg_oldsavedcoffset;
db2c6cb3 7519#ifdef PERL_ANY_COW
4f639d21 7520 rex->saved_copy = PL_nrs;
ed252734 7521#endif
07bc277f 7522 RXp_MATCH_COPIED_on(rex);
51371543
GS
7523 }
7524 PL_reg_magic->mg_len = PL_reg_oldpos;
ed301438 7525 PL_reg_state.re_state_eval_setup_done = FALSE;
51371543
GS
7526 PL_curpm = PL_reg_oldcurpm;
7527 }
7528}
33b8afdf
JH
7529
7530STATIC void
5aaab254 7531S_to_utf8_substr(pTHX_ regexp *prog)
33b8afdf 7532{
7e0d5ad7
KW
7533 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7534 * on the converted value */
7535
a1cac82e 7536 int i = 1;
7918f24d
NC
7537
7538 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7539
a1cac82e
NC
7540 do {
7541 if (prog->substrs->data[i].substr
7542 && !prog->substrs->data[i].utf8_substr) {
7543 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7544 prog->substrs->data[i].utf8_substr = sv;
7545 sv_utf8_upgrade(sv);
610460f9 7546 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7547 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7548 /* Trim the trailing \n that fbm_compile added last
7549 time. */
7550 SvCUR_set(sv, SvCUR(sv) - 1);
7551 /* Whilst this makes the SV technically "invalid" (as its
7552 buffer is no longer followed by "\0") when fbm_compile()
7553 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7554 fbm_compile(sv, FBMcf_TAIL);
7555 } else
7556 fbm_compile(sv, 0);
610460f9 7557 }
a1cac82e
NC
7558 if (prog->substrs->data[i].substr == prog->check_substr)
7559 prog->check_utf8 = sv;
7560 }
7561 } while (i--);
33b8afdf
JH
7562}
7563
7e0d5ad7 7564STATIC bool
5aaab254 7565S_to_byte_substr(pTHX_ regexp *prog)
33b8afdf 7566{
7e0d5ad7
KW
7567 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7568 * on the converted value; returns FALSE if can't be converted. */
7569
97aff369 7570 dVAR;
a1cac82e 7571 int i = 1;
7918f24d
NC
7572
7573 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7574
a1cac82e
NC
7575 do {
7576 if (prog->substrs->data[i].utf8_substr
7577 && !prog->substrs->data[i].substr) {
7578 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
7579 if (! sv_utf8_downgrade(sv, TRUE)) {
7580 return FALSE;
7581 }
5400f398
KW
7582 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7583 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7584 /* Trim the trailing \n that fbm_compile added last
7585 time. */
7586 SvCUR_set(sv, SvCUR(sv) - 1);
7587 fbm_compile(sv, FBMcf_TAIL);
7588 } else
7589 fbm_compile(sv, 0);
7590 }
a1cac82e
NC
7591 prog->substrs->data[i].substr = sv;
7592 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7593 prog->check_substr = sv;
33b8afdf 7594 }
a1cac82e 7595 } while (i--);
7e0d5ad7
KW
7596
7597 return TRUE;
33b8afdf 7598}
66610fdd
RGS
7599
7600/*
7601 * Local variables:
7602 * c-indentation-style: bsd
7603 * c-basic-offset: 4
14d04a33 7604 * indent-tabs-mode: nil
66610fdd
RGS
7605 * End:
7606 *
14d04a33 7607 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7608 */