This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perldelta: combine, merge, and edit "modules updated"
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
c74f6de9
KW
96#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
e0193e47 102/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
635cd5d4 105#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 106 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 107
c277df42
IZ
108/*
109 * Forwards.
110 */
111
f2ed9b32 112#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 113#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 114
3dab1dad
YO
115#define HOPc(pos,off) \
116 (char *)(PL_reg_match_utf8 \
52657f30 117 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
118 : (U8*)(pos + off))
119#define HOPBACKc(pos, off) \
07be1b83
YO
120 (char*)(PL_reg_match_utf8\
121 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
122 : (pos - off >= PL_bostr) \
8e11feef 123 ? (U8*)pos - off \
3dab1dad 124 : NULL)
efb30f32 125
e7409c1b 126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 127#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 128
7016d6eb
DM
129
130#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
131#define NEXTCHR_IS_EOS (nextchr < 0)
132
133#define SET_nextchr \
134 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
135
136#define SET_locinput(p) \
137 locinput = (p); \
138 SET_nextchr
139
140
c7304fe2
KW
141#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
142 if (!swash_ptr) { \
143 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2
KW
144 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
145 1, 0, NULL, &flags); \
146 assert(swash_ptr); \
147 } \
148 } STMT_END
149
150/* If in debug mode, we test that a known character properly matches */
151#ifdef DEBUGGING
152# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
153 property_name, \
154 utf8_char_in_property) \
155 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
156 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
157#else
158# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
159 property_name, \
160 utf8_char_in_property) \
161 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
162#endif
d1eb3177 163
c7304fe2
KW
164#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
165 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
166 swash_property_names[_CC_WORDCHAR], \
167 GREEK_SMALL_LETTER_IOTA_UTF8)
168
169#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
170 STMT_START { \
171 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
172 "_X_regular_begin", \
173 GREEK_SMALL_LETTER_IOTA_UTF8); \
174 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
175 "_X_extend", \
176 COMBINING_GRAVE_ACCENT_UTF8); \
177 } STMT_END
d1eb3177 178
c7304fe2 179#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
180/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
181
5f80c4cf 182/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
183/* it would be nice to rework regcomp.sym to generate this stuff. sigh
184 *
185 * NOTE that *nothing* that affects backtracking should be in here, specifically
186 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
187 * node that is in between two EXACT like nodes when ascertaining what the required
188 * "follow" character is. This should probably be moved to regex compile time
189 * although it may be done at run time beause of the REF possibility - more
190 * investigation required. -- demerphq
191*/
3e901dc0
YO
192#define JUMPABLE(rn) ( \
193 OP(rn) == OPEN || \
194 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
195 OP(rn) == EVAL || \
cca55fe3
JP
196 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
197 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 198 OP(rn) == KEEPS || \
3dab1dad 199 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 200)
ee9b8eae 201#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 202
ee9b8eae
YO
203#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
204
205#if 0
206/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
207 we don't need this definition. */
208#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 209#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
210#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
211
212#else
213/* ... so we use this as its faster. */
214#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 215#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
216#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
217#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
218
219#endif
e2d8ce26 220
a84d97b6
HS
221/*
222 Search for mandatory following text node; for lookahead, the text must
223 follow but for lookbehind (rn->flags != 0) we skip to the next step.
224*/
cca55fe3 225#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
226 while (JUMPABLE(rn)) { \
227 const OPCODE type = OP(rn); \
228 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 229 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 230 else if (type == PLUS) \
cca55fe3 231 rn = NEXTOPER(rn); \
3dab1dad 232 else if (type == IFMATCH) \
a84d97b6 233 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 234 else rn += NEXT_OFF(rn); \
3dab1dad 235 } \
5f80c4cf 236} STMT_END
74750237 237
22913b96
KW
238/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
239 * These are for the pre-composed Hangul syllables, which are all in a
240 * contiguous block and arranged there in such a way so as to facilitate
241 * alorithmic determination of their characteristics. As such, they don't need
242 * a swash, but can be determined by simple arithmetic. Almost all are
243 * GCB=LVT, but every 28th one is a GCB=LV */
244#define SBASE 0xAC00 /* Start of block */
245#define SCount 11172 /* Length of block */
246#define TCount 28
c476f425 247
acfe0abc 248static void restore_pos(pTHX_ void *arg);
51371543 249
87c0511b 250#define REGCP_PAREN_ELEMS 3
f067efbf 251#define REGCP_OTHER_ELEMS 3
e0fa7e2b 252#define REGCP_FRAME_ELEMS 1
620d5b66
NC
253/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
254 * are needed for the regexp context stack bookkeeping. */
255
76e3520e 256STATIC CHECKPOINT
92da3157 257S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 258{
97aff369 259 dVAR;
a3b680e6 260 const int retval = PL_savestack_ix;
92da3157
DM
261 const int paren_elems_to_push =
262 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
263 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
264 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 265 I32 p;
40a82448 266 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 267
b93070ed
DM
268 PERL_ARGS_ASSERT_REGCPPUSH;
269
e49a9654 270 if (paren_elems_to_push < 0)
5637ef5b
NC
271 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
272 paren_elems_to_push);
e49a9654 273
e0fa7e2b
NC
274 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
275 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 276 " out of range (%lu-%ld)",
92da3157
DM
277 total_elems,
278 (unsigned long)maxopenparen,
279 (long)parenfloor);
e0fa7e2b 280
620d5b66 281 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 282
495f47a5 283 DEBUG_BUFFERS_r(
92da3157 284 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
285 PerlIO_printf(Perl_debug_log,
286 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
287 PTR2UV(rex),
288 PTR2UV(rex->offs)
289 );
290 );
92da3157 291 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 292/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
293 SSPUSHINT(rex->offs[p].end);
294 SSPUSHINT(rex->offs[p].start);
1ca2007e 295 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 296 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
297 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
298 (UV)p,
299 (IV)rex->offs[p].start,
300 (IV)rex->offs[p].start_tmp,
301 (IV)rex->offs[p].end
40a82448 302 ));
a0d0e21e 303 }
b1ce53c5 304/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 305 SSPUSHINT(maxopenparen);
b93070ed
DM
306 SSPUSHINT(rex->lastparen);
307 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 308 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 309
a0d0e21e
LW
310 return retval;
311}
312
c277df42 313/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
314#define REGCP_SET(cp) \
315 DEBUG_STATE_r( \
ab3bbdeb 316 PerlIO_printf(Perl_debug_log, \
e4f74956 317 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
318 (IV)PL_savestack_ix)); \
319 cp = PL_savestack_ix
c3464db5 320
ab3bbdeb 321#define REGCP_UNWIND(cp) \
e4f74956 322 DEBUG_STATE_r( \
ab3bbdeb 323 if (cp != PL_savestack_ix) \
e4f74956
YO
324 PerlIO_printf(Perl_debug_log, \
325 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
326 (IV)(cp), (IV)PL_savestack_ix)); \
327 regcpblow(cp)
c277df42 328
a8d1f4b4
DM
329#define UNWIND_PAREN(lp, lcp) \
330 for (n = rex->lastparen; n > lp; n--) \
331 rex->offs[n].end = -1; \
332 rex->lastparen = n; \
333 rex->lastcloseparen = lcp;
334
335
f067efbf 336STATIC void
92da3157 337S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 338{
97aff369 339 dVAR;
e0fa7e2b 340 UV i;
87c0511b 341 U32 paren;
a3621e74
YO
342 GET_RE_DEBUG_FLAGS_DECL;
343
7918f24d
NC
344 PERL_ARGS_ASSERT_REGCPPOP;
345
b1ce53c5 346 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 347 i = SSPOPUV;
e0fa7e2b
NC
348 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
349 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
350 rex->lastcloseparen = SSPOPINT;
351 rex->lastparen = SSPOPINT;
92da3157 352 *maxopenparen_p = SSPOPINT;
b1ce53c5 353
620d5b66 354 i -= REGCP_OTHER_ELEMS;
b1ce53c5 355 /* Now restore the parentheses context. */
495f47a5
DM
356 DEBUG_BUFFERS_r(
357 if (i || rex->lastparen + 1 <= rex->nparens)
358 PerlIO_printf(Perl_debug_log,
359 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
360 PTR2UV(rex),
361 PTR2UV(rex->offs)
362 );
363 );
92da3157 364 paren = *maxopenparen_p;
620d5b66 365 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 366 I32 tmps;
1ca2007e 367 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 368 rex->offs[paren].start = SSPOPINT;
cf93c79d 369 tmps = SSPOPINT;
b93070ed
DM
370 if (paren <= rex->lastparen)
371 rex->offs[paren].end = tmps;
495f47a5
DM
372 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
373 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
374 (UV)paren,
375 (IV)rex->offs[paren].start,
376 (IV)rex->offs[paren].start_tmp,
377 (IV)rex->offs[paren].end,
378 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 379 );
87c0511b 380 paren--;
a0d0e21e 381 }
daf18116 382#if 1
dafc8851
JH
383 /* It would seem that the similar code in regtry()
384 * already takes care of this, and in fact it is in
385 * a better location to since this code can #if 0-ed out
386 * but the code in regtry() is needed or otherwise tests
387 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
388 * (as of patchlevel 7877) will fail. Then again,
389 * this code seems to be necessary or otherwise
225593e1
DM
390 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
391 * --jhi updated by dapm */
b93070ed 392 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 393 if (i > *maxopenparen_p)
b93070ed
DM
394 rex->offs[i].start = -1;
395 rex->offs[i].end = -1;
495f47a5
DM
396 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
397 " \\%"UVuf": %s ..-1 undeffing\n",
398 (UV)i,
92da3157 399 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 400 ));
a0d0e21e 401 }
dafc8851 402#endif
a0d0e21e
LW
403}
404
74088413
DM
405/* restore the parens and associated vars at savestack position ix,
406 * but without popping the stack */
407
408STATIC void
92da3157 409S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
410{
411 I32 tmpix = PL_savestack_ix;
412 PL_savestack_ix = ix;
92da3157 413 regcppop(rex, maxopenparen_p);
74088413
DM
414 PL_savestack_ix = tmpix;
415}
416
02db2b7b 417#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 418
31c7f561
KW
419STATIC bool
420S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
421{
422 /* Returns a boolean as to whether or not 'character' is a member of the
423 * Posix character class given by 'classnum' that should be equivalent to a
424 * value in the typedef '_char_class_number'.
425 *
426 * Ideally this could be replaced by a just an array of function pointers
427 * to the C library functions that implement the macros this calls.
428 * However, to compile, the precise function signatures are required, and
429 * these may vary from platform to to platform. To avoid having to figure
430 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
431 * which adds an extra layer of function call overhead (unless the C
432 * optimizer strips it away). But we don't particularly care about
433 * performance with locales anyway. */
31c7f561
KW
434
435 switch ((_char_class_number) classnum) {
15861f94 436 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 437 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
438 case _CC_ENUM_ASCII: return isASCII_LC(character);
439 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
440 case _CC_ENUM_CASED: return isLOWER_LC(character)
441 || isUPPER_LC(character);
e8d596e0 442 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
443 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
444 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
445 case _CC_ENUM_LOWER: return isLOWER_LC(character);
446 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 447 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 448 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 449 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
450 case _CC_ENUM_UPPER: return isUPPER_LC(character);
451 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 452 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
453 default: /* VERTSPACE should never occur in locales */
454 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
455 }
456
457 assert(0); /* NOTREACHED */
458 return FALSE;
459}
460
3018b823
KW
461STATIC bool
462S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
463{
464 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
465 * 'character' is a member of the Posix character class given by 'classnum'
466 * that should be equivalent to a value in the typedef
467 * '_char_class_number'.
468 *
469 * This just calls isFOO_lc on the code point for the character if it is in
470 * the range 0-255. Outside that range, all characters avoid Unicode
471 * rules, ignoring any locale. So use the Unicode function if this class
472 * requires a swash, and use the Unicode macro otherwise. */
473
474 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
475
476 if (UTF8_IS_INVARIANT(*character)) {
477 return isFOO_lc(classnum, *character);
478 }
479 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
480 return isFOO_lc(classnum,
481 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
482 }
483
484 if (classnum < _FIRST_NON_SWASH_CC) {
485
486 /* Initialize the swash unless done already */
487 if (! PL_utf8_swash_ptrs[classnum]) {
488 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
489 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
490 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
491 }
492
92a2046b
KW
493 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
494 character,
495 TRUE /* is UTF */ ));
3018b823
KW
496 }
497
498 switch ((_char_class_number) classnum) {
499 case _CC_ENUM_SPACE:
500 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
501
502 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
503 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
504 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
505 default: return 0; /* Things like CNTRL are always
506 below 256 */
507 }
508
509 assert(0); /* NOTREACHED */
510 return FALSE;
511}
512
a687059c 513/*
e50aee73 514 * pregexec and friends
a687059c
LW
515 */
516
76234dfb 517#ifndef PERL_IN_XSUB_RE
a687059c 518/*
c277df42 519 - pregexec - match a regexp against a string
a687059c 520 */
c277df42 521I32
5aaab254 522Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
c3464db5 523 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
524/* stringarg: the point in the string at which to begin matching */
525/* strend: pointer to null at end of string */
526/* strbeg: real beginning of string */
527/* minend: end of match must be >= minend bytes after stringarg. */
528/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
529 * itself is accessed via the pointers above */
530/* nosave: For optimizations. */
c277df42 531{
7918f24d
NC
532 PERL_ARGS_ASSERT_PREGEXEC;
533
c277df42 534 return
9041c2e3 535 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
536 nosave ? 0 : REXEC_COPY_STR);
537}
76234dfb 538#endif
22e551b9 539
9041c2e3 540/*
cad2e5aa
JH
541 * Need to implement the following flags for reg_anch:
542 *
543 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
544 * USE_INTUIT_ML
545 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
546 * INTUIT_AUTORITATIVE_ML
547 * INTUIT_ONCE_NOML - Intuit can match in one location only.
548 * INTUIT_ONCE_ML
549 *
550 * Another flag for this function: SECOND_TIME (so that float substrs
551 * with giant delta may be not rechecked).
552 */
553
554/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
555
3f7c398e 556/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
557 Otherwise, only SvCUR(sv) is used to get strbeg. */
558
559/* XXXX We assume that strpos is strbeg unless sv. */
560
6eb5f6b9
JH
561/* XXXX Some places assume that there is a fixed substring.
562 An update may be needed if optimizer marks as "INTUITable"
563 RExen without fixed substrings. Similarly, it is assumed that
564 lengths of all the strings are no more than minlen, thus they
565 cannot come from lookahead.
40d049e4
YO
566 (Or minlen should take into account lookahead.)
567 NOTE: Some of this comment is not correct. minlen does now take account
568 of lookahead/behind. Further research is required. -- demerphq
569
570*/
6eb5f6b9 571
2c2d71f5
JH
572/* A failure to find a constant substring means that there is no need to make
573 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 574 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
575 regtry() should be needed.
576
577 REx compiler's optimizer found 4 possible hints:
578 a) Anchored substring;
579 b) Fixed substring;
580 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 581 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 582 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
583 string which does not contradict any of them.
584 */
2c2d71f5 585
6eb5f6b9
JH
586/* Most of decisions we do here should have been done at compile time.
587 The nodes of the REx which we used for the search should have been
588 deleted from the finite automaton. */
589
cad2e5aa 590char *
288b8c02 591Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 592 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 593{
97aff369 594 dVAR;
8d919b0a 595 struct regexp *const prog = ReANY(rx);
eb578fdb 596 I32 start_shift = 0;
cad2e5aa 597 /* Should be nonnegative! */
eb578fdb
KW
598 I32 end_shift = 0;
599 char *s;
600 SV *check;
a1933d95 601 char *strbeg;
cad2e5aa 602 char *t;
f2ed9b32 603 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 604 I32 ml_anch;
eb578fdb 605 char *other_last = NULL; /* other substr checked before this */
bd61b366 606 char *check_at = NULL; /* check substr found at this pos */
d8080198 607 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 608 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 609 RXi_GET_DECL(prog,progi);
984e6dd1 610 bool is_utf8_pat;
30944b6d 611#ifdef DEBUGGING
890ce7af 612 const char * const i_strpos = strpos;
30944b6d 613#endif
a3621e74
YO
614 GET_RE_DEBUG_FLAGS_DECL;
615
7918f24d 616 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
617 PERL_UNUSED_ARG(flags);
618 PERL_UNUSED_ARG(data);
7918f24d 619
f2ed9b32 620 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 621
984e6dd1 622 is_utf8_pat = cBOOL(RX_UTF8(rx));
4fab19ce 623
ab3bbdeb 624 DEBUG_EXECUTE_r(
f2ed9b32 625 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
626 sv ? "Guessing start of match in sv for"
627 : "Guessing start of match in string for");
2a782b5b 628 );
cad2e5aa 629
c344f387
JH
630 /* CHR_DIST() would be more correct here but it makes things slow. */
631 if (prog->minlen > strend - strpos) {
a3621e74 632 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 633 "String too short... [re_intuit_start]\n"));
cad2e5aa 634 goto fail;
2c2d71f5 635 }
d8da0584 636
7016d6eb
DM
637 /* XXX we need to pass strbeg as a separate arg: the following is
638 * guesswork and can be wrong... */
639 if (sv && SvPOK(sv)) {
640 char * p = SvPVX(sv);
641 STRLEN cur = SvCUR(sv);
642 if (p <= strpos && strpos < p + cur) {
643 strbeg = p;
644 assert(p <= strend && strend <= p + cur);
645 }
646 else
647 strbeg = strend - cur;
648 }
649 else
650 strbeg = strpos;
651
1aa99e6b 652 PL_regeol = strend;
f2ed9b32 653 if (utf8_target) {
33b8afdf
JH
654 if (!prog->check_utf8 && prog->check_substr)
655 to_utf8_substr(prog);
656 check = prog->check_utf8;
657 } else {
7e0d5ad7
KW
658 if (!prog->check_substr && prog->check_utf8) {
659 if (! to_byte_substr(prog)) {
6b54ddc5 660 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
661 }
662 }
33b8afdf
JH
663 check = prog->check_substr;
664 }
bbe252da
YO
665 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
666 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
667 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 668 && !multiline ) ); /* Check after \n? */
cad2e5aa 669
7e25d62c 670 if (!ml_anch) {
bbe252da
YO
671 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
672 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 673 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
674 && sv && !SvROK(sv)
675 && (strpos != strbeg)) {
a3621e74 676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
677 goto fail;
678 }
d46b78c6
KW
679 if (prog->check_offset_min == prog->check_offset_max
680 && !(prog->extflags & RXf_CANY_SEEN)
681 && ! multiline) /* /m can cause \n's to match that aren't
682 accounted for in the string max length.
683 See [perl #115242] */
684 {
2c2d71f5 685 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
686 I32 slen;
687
1aa99e6b 688 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 689
653099ff
GS
690 if (SvTAIL(check)) {
691 slen = SvCUR(check); /* >= 1 */
cad2e5aa 692
9041c2e3 693 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 694 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 695 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 696 goto fail_finish;
cad2e5aa
JH
697 }
698 /* Now should match s[0..slen-2] */
699 slen--;
3f7c398e 700 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 701 || (slen > 1
3f7c398e 702 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 703 report_neq:
a3621e74 704 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
705 goto fail_finish;
706 }
cad2e5aa 707 }
3f7c398e 708 else if (*SvPVX_const(check) != *s
653099ff 709 || ((slen = SvCUR(check)) > 1
3f7c398e 710 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 711 goto report_neq;
c315bfe8 712 check_at = s;
2c2d71f5 713 goto success_at_start;
7e25d62c 714 }
cad2e5aa 715 }
2c2d71f5 716 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 717 s = strpos;
2c2d71f5 718 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
719 end_shift = prog->check_end_shift;
720
2c2d71f5 721 if (!ml_anch) {
a3b680e6 722 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 723 - (SvTAIL(check) != 0);
a3b680e6 724 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
725
726 if (end_shift < eshift)
727 end_shift = eshift;
728 }
cad2e5aa 729 }
2c2d71f5 730 else { /* Can match at random position */
cad2e5aa
JH
731 ml_anch = 0;
732 s = strpos;
1de06328
YO
733 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
734 end_shift = prog->check_end_shift;
735
736 /* end shift should be non negative here */
cad2e5aa
JH
737 }
738
bcdf7404 739#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 740 if (end_shift < 0)
1de06328 741 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 742 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
743#endif
744
2c2d71f5
JH
745 restart:
746 /* Find a possible match in the region s..strend by looking for
747 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
748
749 {
750 I32 srch_start_shift = start_shift;
751 I32 srch_end_shift = end_shift;
c33e64f0
FC
752 U8* start_point;
753 U8* end_point;
1de06328
YO
754 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
755 srch_end_shift -= ((strbeg - s) - srch_start_shift);
756 srch_start_shift = strbeg - s;
757 }
6bda09f9 758 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
759 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
760 (IV)prog->check_offset_min,
761 (IV)srch_start_shift,
762 (IV)srch_end_shift,
763 (IV)prog->check_end_shift);
764 });
765
bbe252da 766 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
767 start_point= (U8*)(s + srch_start_shift);
768 end_point= (U8*)(strend - srch_end_shift);
769 } else {
770 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
771 end_point= HOP3(strend, -srch_end_shift, strbeg);
772 }
6bda09f9 773 DEBUG_OPTIMISE_MORE_r({
56570a2c 774 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 775 (int)(end_point - start_point),
fc8cd66c 776 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
777 start_point);
778 });
779
780 s = fbm_instr( start_point, end_point,
7fba1cd6 781 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 782 }
cad2e5aa
JH
783 /* Update the count-of-usability, remove useless subpatterns,
784 unshift s. */
2c2d71f5 785
ab3bbdeb 786 DEBUG_EXECUTE_r({
f2ed9b32 787 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
788 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
789 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 790 (s ? "Found" : "Did not find"),
f2ed9b32 791 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
792 ? "anchored" : "floating"),
793 quoted,
794 RE_SV_TAIL(check),
795 (s ? " at offset " : "...\n") );
796 });
2c2d71f5
JH
797
798 if (!s)
799 goto fail_finish;
2c2d71f5 800 /* Finish the diagnostic message */
a3621e74 801 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 802
1de06328
YO
803 /* XXX dmq: first branch is for positive lookbehind...
804 Our check string is offset from the beginning of the pattern.
805 So we need to do any stclass tests offset forward from that
806 point. I think. :-(
807 */
808
809
810
811 check_at=s;
812
813
2c2d71f5
JH
814 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
815 Start with the other substr.
816 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 817 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
818 *always* match. Probably should be marked during compile...
819 Probably it is right to do no SCREAM here...
820 */
821
f2ed9b32 822 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
823 : (prog->float_substr && prog->anchored_substr))
824 {
30944b6d 825 /* Take into account the "other" substring. */
2c2d71f5
JH
826 /* XXXX May be hopelessly wrong for UTF... */
827 if (!other_last)
6eb5f6b9 828 other_last = strpos;
f2ed9b32 829 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
830 do_other_anchored:
831 {
890ce7af
AL
832 char * const last = HOP3c(s, -start_shift, strbeg);
833 char *last1, *last2;
be8e71aa 834 char * const saved_s = s;
33b8afdf 835 SV* must;
2c2d71f5 836
2c2d71f5
JH
837 t = s - prog->check_offset_max;
838 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 839 && (!utf8_target
0ce71af7 840 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 841 && t > strpos)))
6f207bd3 842 NOOP;
2c2d71f5
JH
843 else
844 t = strpos;
1aa99e6b 845 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
846 if (t < other_last) /* These positions already checked */
847 t = other_last;
1aa99e6b 848 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
849 if (last < last1)
850 last1 = last;
1de06328
YO
851 /* XXXX It is not documented what units *_offsets are in.
852 We assume bytes, but this is clearly wrong.
853 Meaning this code needs to be carefully reviewed for errors.
854 dmq.
855 */
856
2c2d71f5 857 /* On end-of-str: see comment below. */
f2ed9b32 858 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
859 if (must == &PL_sv_undef) {
860 s = (char*)NULL;
1de06328 861 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
862 }
863 else
864 s = fbm_instr(
865 (unsigned char*)t,
866 HOP3(HOP3(last1, prog->anchored_offset, strend)
867 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
868 must,
7fba1cd6 869 multiline ? FBMrf_MULTILINE : 0
33b8afdf 870 );
ab3bbdeb 871 DEBUG_EXECUTE_r({
f2ed9b32 872 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
873 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
874 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 875 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
876 quoted, RE_SV_TAIL(must));
877 });
878
879
2c2d71f5
JH
880 if (!s) {
881 if (last1 >= last2) {
a3621e74 882 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
883 ", giving up...\n"));
884 goto fail_finish;
885 }
a3621e74 886 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 887 ", trying floating at offset %ld...\n",
be8e71aa 888 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
889 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
890 s = HOP3c(last, 1, strend);
2c2d71f5
JH
891 goto restart;
892 }
893 else {
a3621e74 894 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 895 (long)(s - i_strpos)));
1aa99e6b
IH
896 t = HOP3c(s, -prog->anchored_offset, strbeg);
897 other_last = HOP3c(s, 1, strend);
be8e71aa 898 s = saved_s;
2c2d71f5
JH
899 if (t == strpos)
900 goto try_at_start;
2c2d71f5
JH
901 goto try_at_offset;
902 }
30944b6d 903 }
2c2d71f5
JH
904 }
905 else { /* Take into account the floating substring. */
33b8afdf 906 char *last, *last1;
be8e71aa 907 char * const saved_s = s;
33b8afdf
JH
908 SV* must;
909
910 t = HOP3c(s, -start_shift, strbeg);
911 last1 = last =
912 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
913 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
914 last = HOP3c(t, prog->float_max_offset, strend);
915 s = HOP3c(t, prog->float_min_offset, strend);
916 if (s < other_last)
917 s = other_last;
2c2d71f5 918 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 919 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
920 /* fbm_instr() takes into account exact value of end-of-str
921 if the check is SvTAIL(ed). Since false positives are OK,
922 and end-of-str is not later than strend we are OK. */
923 if (must == &PL_sv_undef) {
924 s = (char*)NULL;
1de06328 925 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
926 }
927 else
2c2d71f5 928 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
929 (unsigned char*)last + SvCUR(must)
930 - (SvTAIL(must)!=0),
7fba1cd6 931 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 932 DEBUG_EXECUTE_r({
f2ed9b32 933 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
934 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
935 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 936 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
937 quoted, RE_SV_TAIL(must));
938 });
33b8afdf
JH
939 if (!s) {
940 if (last1 == last) {
a3621e74 941 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
942 ", giving up...\n"));
943 goto fail_finish;
2c2d71f5 944 }
a3621e74 945 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 946 ", trying anchored starting at offset %ld...\n",
be8e71aa 947 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
948 other_last = last;
949 s = HOP3c(t, 1, strend);
950 goto restart;
951 }
952 else {
a3621e74 953 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
954 (long)(s - i_strpos)));
955 other_last = s; /* Fix this later. --Hugo */
be8e71aa 956 s = saved_s;
33b8afdf
JH
957 if (t == strpos)
958 goto try_at_start;
959 goto try_at_offset;
960 }
2c2d71f5 961 }
cad2e5aa 962 }
2c2d71f5 963
1de06328 964
9ef43ace 965 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 966
6bda09f9 967 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
968 PerlIO_printf(Perl_debug_log,
969 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
970 (IV)prog->check_offset_min,
971 (IV)prog->check_offset_max,
972 (IV)(s-strpos),
973 (IV)(t-strpos),
974 (IV)(t-s),
975 (IV)(strend-strpos)
976 )
977 );
978
2c2d71f5 979 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 980 && (!utf8_target
9ef43ace 981 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
982 && t > strpos)))
983 {
2c2d71f5
JH
984 /* Fixed substring is found far enough so that the match
985 cannot start at strpos. */
986 try_at_offset:
cad2e5aa 987 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
988 /* Eventually fbm_*() should handle this, but often
989 anchored_offset is not 0, so this check will not be wasted. */
990 /* XXXX In the code below we prefer to look for "^" even in
991 presence of anchored substrings. And we search even
992 beyond the found float position. These pessimizations
993 are historical artefacts only. */
994 find_anchor:
2c2d71f5 995 while (t < strend - prog->minlen) {
cad2e5aa 996 if (*t == '\n') {
4ee3650e 997 if (t < check_at - prog->check_offset_min) {
f2ed9b32 998 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
999 /* Since we moved from the found position,
1000 we definitely contradict the found anchored
30944b6d
IZ
1001 substr. Due to the above check we do not
1002 contradict "check" substr.
1003 Thus we can arrive here only if check substr
1004 is float. Redo checking for "other"=="fixed".
1005 */
9041c2e3 1006 strpos = t + 1;
a3621e74 1007 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 1008 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
1009 goto do_other_anchored;
1010 }
4ee3650e
GS
1011 /* We don't contradict the found floating substring. */
1012 /* XXXX Why not check for STCLASS? */
cad2e5aa 1013 s = t + 1;
a3621e74 1014 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1015 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1016 goto set_useful;
1017 }
4ee3650e
GS
1018 /* Position contradicts check-string */
1019 /* XXXX probably better to look for check-string
1020 than for "\n", so one should lower the limit for t? */
a3621e74 1021 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1022 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1023 other_last = strpos = s = t + 1;
cad2e5aa
JH
1024 goto restart;
1025 }
1026 t++;
1027 }
a3621e74 1028 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1029 PL_colors[0], PL_colors[1]));
2c2d71f5 1030 goto fail_finish;
cad2e5aa 1031 }
f5952150 1032 else {
a3621e74 1033 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1034 PL_colors[0], PL_colors[1]));
f5952150 1035 }
cad2e5aa
JH
1036 s = t;
1037 set_useful:
f2ed9b32 1038 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1039 }
1040 else {
f5952150 1041 /* The found string does not prohibit matching at strpos,
2c2d71f5 1042 - no optimization of calling REx engine can be performed,
f5952150
GS
1043 unless it was an MBOL and we are not after MBOL,
1044 or a future STCLASS check will fail this. */
2c2d71f5
JH
1045 try_at_start:
1046 /* Even in this situation we may use MBOL flag if strpos is offset
1047 wrt the start of the string. */
05b4157f 1048 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1049 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1050 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1051 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1052 {
cad2e5aa
JH
1053 t = strpos;
1054 goto find_anchor;
1055 }
a3621e74 1056 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1057 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1058 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1059 );
2c2d71f5 1060 success_at_start:
bbe252da 1061 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1062 && (utf8_target ? (
33b8afdf
JH
1063 prog->check_utf8 /* Could be deleted already */
1064 && --BmUSEFUL(prog->check_utf8) < 0
1065 && (prog->check_utf8 == prog->float_utf8)
1066 ) : (
1067 prog->check_substr /* Could be deleted already */
1068 && --BmUSEFUL(prog->check_substr) < 0
1069 && (prog->check_substr == prog->float_substr)
1070 )))
66e933ab 1071 {
cad2e5aa 1072 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1073 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1074 /* XXX Does the destruction order has to change with utf8_target? */
1075 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1076 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1077 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1078 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1079 check = NULL; /* abort */
cad2e5aa 1080 s = strpos;
486ec47a 1081 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1082 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1083 if (prog->intflags & PREGf_IMPLICIT)
1084 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1085 /* XXXX This is a remnant of the old implementation. It
1086 looks wasteful, since now INTUIT can use many
6eb5f6b9 1087 other heuristics. */
bbe252da 1088 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1089 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1090 }
1091 else
1092 s = strpos;
1093 }
1094
6eb5f6b9
JH
1095 /* Last resort... */
1096 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1097 /* trie stclasses are too expensive to use here, we are better off to
1098 leave it to regmatch itself */
f8fc2ecf 1099 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1100 /* minlen == 0 is possible if regstclass is \b or \B,
1101 and the fixed substr is ''$.
1102 Since minlen is already taken into account, s+1 is before strend;
1103 accidentally, minlen >= 1 guaranties no false positives at s + 1
1104 even for \b or \B. But (minlen? 1 : 0) below assumes that
1105 regstclass does not come from lookahead... */
1106 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1107 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1108 const U8* const str = (U8*)STRING(progi->regstclass);
1109 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1110 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1111 : 1);
1de06328
YO
1112 char * endpos;
1113 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1114 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1115 else if (prog->float_substr || prog->float_utf8)
1116 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1117 else
1118 endpos= strend;
1119
d8080198
YO
1120 if (checked_upto < s)
1121 checked_upto = s;
1122 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1123 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1124
6eb5f6b9 1125 t = s;
984e6dd1
DM
1126 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1127 NULL, is_utf8_pat);
d8080198
YO
1128 if (s) {
1129 checked_upto = s;
1130 } else {
6eb5f6b9 1131#ifdef DEBUGGING
cbbf8932 1132 const char *what = NULL;
6eb5f6b9
JH
1133#endif
1134 if (endpos == strend) {
a3621e74 1135 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1136 "Could not match STCLASS...\n") );
1137 goto fail;
1138 }
a3621e74 1139 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1140 "This position contradicts STCLASS...\n") );
bbe252da 1141 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1142 goto fail;
d8080198
YO
1143 checked_upto = HOPBACKc(endpos, start_shift);
1144 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1145 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1146 /* Contradict one of substrings */
33b8afdf 1147 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1148 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1149 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1150 hop_and_restart:
1aa99e6b 1151 s = HOP3c(t, 1, strend);
66e933ab
GS
1152 if (s + start_shift + end_shift > strend) {
1153 /* XXXX Should be taken into account earlier? */
a3621e74 1154 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1155 "Could not match STCLASS...\n") );
1156 goto fail;
1157 }
5e39e1e5
HS
1158 if (!check)
1159 goto giveup;
a3621e74 1160 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1161 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1162 what, (long)(s + start_shift - i_strpos)) );
1163 goto restart;
1164 }
66e933ab 1165 /* Have both, check_string is floating */
6eb5f6b9
JH
1166 if (t + start_shift >= check_at) /* Contradicts floating=check */
1167 goto retry_floating_check;
1168 /* Recheck anchored substring, but not floating... */
9041c2e3 1169 s = check_at;
5e39e1e5
HS
1170 if (!check)
1171 goto giveup;
a3621e74 1172 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1173 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1174 (long)(other_last - i_strpos)) );
1175 goto do_other_anchored;
1176 }
60e71179
GS
1177 /* Another way we could have checked stclass at the
1178 current position only: */
1179 if (ml_anch) {
1180 s = t = t + 1;
5e39e1e5
HS
1181 if (!check)
1182 goto giveup;
a3621e74 1183 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1184 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1185 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1186 goto try_at_offset;
66e933ab 1187 }
f2ed9b32 1188 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1189 goto fail;
486ec47a 1190 /* Check is floating substring. */
6eb5f6b9
JH
1191 retry_floating_check:
1192 t = check_at - start_shift;
a3621e74 1193 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1194 goto hop_and_restart;
1195 }
b7953727 1196 if (t != s) {
a3621e74 1197 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1198 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1199 (long)(t - i_strpos), (long)(s - i_strpos))
1200 );
1201 }
1202 else {
a3621e74 1203 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1204 "Does not contradict STCLASS...\n");
1205 );
1206 }
6eb5f6b9 1207 }
5e39e1e5 1208 giveup:
a3621e74 1209 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1210 PL_colors[4], (check ? "Guessed" : "Giving up"),
1211 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1212 return s;
2c2d71f5
JH
1213
1214 fail_finish: /* Substring not found */
33b8afdf 1215 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1216 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1217 fail:
a3621e74 1218 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1219 PL_colors[4], PL_colors[5]));
bd61b366 1220 return NULL;
cad2e5aa 1221}
9661b544 1222
a0a388a1
YO
1223#define DECL_TRIE_TYPE(scan) \
1224 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1225 trie_type = ((scan->flags == EXACT) \
1226 ? (utf8_target ? trie_utf8 : trie_plain) \
1227 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1228
fd3249ee
YO
1229#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1230STMT_START { \
fab2782b
YO
1231 STRLEN skiplen; \
1232 switch (trie_type) { \
1233 case trie_utf8_fold: \
1234 if ( foldlen>0 ) { \
1235 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1236 foldlen -= len; \
1237 uscan += len; \
1238 len=0; \
1239 } else { \
1240 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1241 len = UTF8SKIP(uc); \
1242 skiplen = UNISKIP( uvc ); \
1243 foldlen -= skiplen; \
1244 uscan = foldbuf + skiplen; \
1245 } \
1246 break; \
1247 case trie_latin_utf8_fold: \
1248 if ( foldlen>0 ) { \
1249 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1250 foldlen -= len; \
1251 uscan += len; \
1252 len=0; \
1253 } else { \
1254 len = 1; \
1255 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1256 skiplen = UNISKIP( uvc ); \
1257 foldlen -= skiplen; \
1258 uscan = foldbuf + skiplen; \
1259 } \
1260 break; \
1261 case trie_utf8: \
1262 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1263 break; \
1264 case trie_plain: \
1265 uvc = (UV)*uc; \
1266 len = 1; \
1267 } \
1268 if (uvc < 256) { \
1269 charid = trie->charmap[ uvc ]; \
1270 } \
1271 else { \
1272 charid = 0; \
1273 if (widecharmap) { \
1274 SV** const svpp = hv_fetch(widecharmap, \
1275 (char*)&uvc, sizeof(UV), 0); \
1276 if (svpp) \
1277 charid = (U16)SvIV(*svpp); \
1278 } \
1279 } \
4cadc6a9
YO
1280} STMT_END
1281
4cadc6a9
YO
1282#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1283STMT_START { \
1284 while (s <= e) { \
1285 if ( (CoNd) \
fac1af77 1286 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1287 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1288 goto got_it; \
1289 s++; \
1290 } \
1291} STMT_END
1292
1293#define REXEC_FBC_UTF8_SCAN(CoDe) \
1294STMT_START { \
9a902117 1295 while (s < strend) { \
4cadc6a9 1296 CoDe \
9a902117 1297 s += UTF8SKIP(s); \
4cadc6a9
YO
1298 } \
1299} STMT_END
1300
1301#define REXEC_FBC_SCAN(CoDe) \
1302STMT_START { \
1303 while (s < strend) { \
1304 CoDe \
1305 s++; \
1306 } \
1307} STMT_END
1308
1309#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1310REXEC_FBC_UTF8_SCAN( \
1311 if (CoNd) { \
7aee35ff 1312 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1313 goto got_it; \
1314 else \
1315 tmp = doevery; \
1316 } \
1317 else \
1318 tmp = 1; \
1319)
1320
1321#define REXEC_FBC_CLASS_SCAN(CoNd) \
1322REXEC_FBC_SCAN( \
1323 if (CoNd) { \
24b23f37 1324 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1325 goto got_it; \
1326 else \
1327 tmp = doevery; \
1328 } \
1329 else \
1330 tmp = 1; \
1331)
1332
1333#define REXEC_FBC_TRYIT \
24b23f37 1334if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1335 goto got_it
1336
e1d1eefb 1337#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1338 if (utf8_target) { \
e1d1eefb
YO
1339 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1340 } \
1341 else { \
1342 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1343 }
e1d1eefb 1344
786e8c11
YO
1345#define DUMP_EXEC_POS(li,s,doutf8) \
1346 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1347
cfaf538b
KW
1348
1349#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1350 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1351 tmp = TEST_NON_UTF8(tmp); \
1352 REXEC_FBC_UTF8_SCAN( \
1353 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1354 tmp = !tmp; \
1355 IF_SUCCESS; \
1356 } \
1357 else { \
1358 IF_FAIL; \
1359 } \
1360 ); \
1361
1362#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1363 if (s == PL_bostr) { \
1364 tmp = '\n'; \
1365 } \
1366 else { \
1367 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1368 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1369 } \
1370 tmp = TeSt1_UtF8; \
1371 LOAD_UTF8_CHARCLASS_ALNUM(); \
1372 REXEC_FBC_UTF8_SCAN( \
1373 if (tmp == ! (TeSt2_UtF8)) { \
1374 tmp = !tmp; \
1375 IF_SUCCESS; \
1376 } \
1377 else { \
1378 IF_FAIL; \
1379 } \
1380 ); \
1381
63ac0dad
KW
1382/* The only difference between the BOUND and NBOUND cases is that
1383 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1384 * NBOUND. This is accomplished by passing it in either the if or else clause,
1385 * with the other one being empty */
1386#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1387 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1388
1389#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1390 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1391
1392#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1393 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1394
1395#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1396 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1397
63ac0dad
KW
1398
1399/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1400 * be passed in completely with the variable name being tested, which isn't
1401 * such a clean interface, but this is easier to read than it was before. We
1402 * are looking for the boundary (or non-boundary between a word and non-word
1403 * character. The utf8 and non-utf8 cases have the same logic, but the details
1404 * must be different. Find the "wordness" of the character just prior to this
1405 * one, and compare it with the wordness of this one. If they differ, we have
1406 * a boundary. At the beginning of the string, pretend that the previous
1407 * character was a new-line */
cfaf538b 1408#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1409 if (utf8_target) { \
cfaf538b 1410 UTF8_CODE \
63ac0dad
KW
1411 } \
1412 else { /* Not utf8 */ \
1413 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1414 tmp = TEST_NON_UTF8(tmp); \
1415 REXEC_FBC_SCAN( \
1416 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1417 tmp = !tmp; \
1418 IF_SUCCESS; \
1419 } \
1420 else { \
1421 IF_FAIL; \
1422 } \
1423 ); \
1424 } \
1425 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1426 goto got_it;
1427
786e8c11
YO
1428/* We know what class REx starts with. Try to find this position... */
1429/* if reginfo is NULL, its a dryrun */
1430/* annoyingly all the vars in this routine have different names from their counterparts
1431 in regmatch. /grrr */
1432
3c3eec57 1433STATIC char *
07be1b83 1434S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
984e6dd1 1435 const char *strend, regmatch_info *reginfo, bool is_utf8_pat)
a687059c 1436{
73104a1b
KW
1437 dVAR;
1438 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1439 char *pat_string; /* The pattern's exactish string */
1440 char *pat_end; /* ptr to end char of pat_string */
1441 re_fold_t folder; /* Function for computing non-utf8 folds */
1442 const U8 *fold_array; /* array for folding ords < 256 */
1443 STRLEN ln;
1444 STRLEN lnc;
73104a1b
KW
1445 U8 c1;
1446 U8 c2;
1447 char *e;
1448 I32 tmp = 1; /* Scratch variable? */
1449 const bool utf8_target = PL_reg_match_utf8;
1450 UV utf8_fold_flags = 0;
3018b823
KW
1451 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1452 with a result inverts that result, as 0^1 =
1453 1 and 1^1 = 0 */
1454 _char_class_number classnum;
1455
73104a1b 1456 RXi_GET_DECL(prog,progi);
2f7f8cb1 1457
73104a1b 1458 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1459
73104a1b
KW
1460 /* We know what class it must start with. */
1461 switch (OP(c)) {
1462 case ANYOF:
9aa1e39f 1463 case ANYOF_SYNTHETIC:
954a2af6 1464 case ANYOF_WARN_SUPER:
73104a1b
KW
1465 if (utf8_target) {
1466 REXEC_FBC_UTF8_CLASS_SCAN(
1467 reginclass(prog, c, (U8*)s, utf8_target));
1468 }
1469 else {
1470 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1471 }
1472 break;
1473 case CANY:
1474 REXEC_FBC_SCAN(
1475 if (tmp && (!reginfo || regtry(reginfo, &s)))
1476 goto got_it;
1477 else
1478 tmp = doevery;
1479 );
1480 break;
1481
1482 case EXACTFA:
984e6dd1 1483 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1484 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1485 goto do_exactf_utf8;
1486 }
1487 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1488 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1489 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1490
73104a1b
KW
1491 case EXACTF:
1492 if (utf8_target) {
16d951b7 1493
73104a1b
KW
1494 /* regcomp.c already folded this if pattern is in UTF-8 */
1495 utf8_fold_flags = 0;
1496 goto do_exactf_utf8;
1497 }
1498 fold_array = PL_fold;
1499 folder = foldEQ;
1500 goto do_exactf_non_utf8;
1501
1502 case EXACTFL:
984e6dd1 1503 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1504 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1505 goto do_exactf_utf8;
1506 }
1507 fold_array = PL_fold_locale;
1508 folder = foldEQ_locale;
1509 goto do_exactf_non_utf8;
3c760661 1510
73104a1b 1511 case EXACTFU_SS:
984e6dd1 1512 if (is_utf8_pat) {
73104a1b
KW
1513 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1514 }
1515 goto do_exactf_utf8;
16d951b7 1516
73104a1b
KW
1517 case EXACTFU_TRICKYFOLD:
1518 case EXACTFU:
984e6dd1
DM
1519 if (is_utf8_pat || utf8_target) {
1520 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1521 goto do_exactf_utf8;
1522 }
fac1af77 1523
73104a1b
KW
1524 /* Any 'ss' in the pattern should have been replaced by regcomp,
1525 * so we don't have to worry here about this single special case
1526 * in the Latin1 range */
1527 fold_array = PL_fold_latin1;
1528 folder = foldEQ_latin1;
1529
1530 /* FALL THROUGH */
1531
1532 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1533 are no glitches with fold-length differences
1534 between the target string and pattern */
1535
1536 /* The idea in the non-utf8 EXACTF* cases is to first find the
1537 * first character of the EXACTF* node and then, if necessary,
1538 * case-insensitively compare the full text of the node. c1 is the
1539 * first character. c2 is its fold. This logic will not work for
1540 * Unicode semantics and the german sharp ss, which hence should
1541 * not be compiled into a node that gets here. */
1542 pat_string = STRING(c);
1543 ln = STR_LEN(c); /* length to match in octets/bytes */
1544
1545 /* We know that we have to match at least 'ln' bytes (which is the
1546 * same as characters, since not utf8). If we have to match 3
1547 * characters, and there are only 2 availabe, we know without
1548 * trying that it will fail; so don't start a match past the
1549 * required minimum number from the far end */
1550 e = HOP3c(strend, -((I32)ln), s);
1551
1552 if (!reginfo && e < s) {
1553 e = s; /* Due to minlen logic of intuit() */
1554 }
fac1af77 1555
73104a1b
KW
1556 c1 = *pat_string;
1557 c2 = fold_array[c1];
1558 if (c1 == c2) { /* If char and fold are the same */
1559 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1560 }
1561 else {
1562 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1563 }
1564 break;
fac1af77 1565
73104a1b
KW
1566 do_exactf_utf8:
1567 {
1568 unsigned expansion;
1569
1570 /* If one of the operands is in utf8, we can't use the simpler folding
1571 * above, due to the fact that many different characters can have the
1572 * same fold, or portion of a fold, or different- length fold */
1573 pat_string = STRING(c);
1574 ln = STR_LEN(c); /* length to match in octets/bytes */
1575 pat_end = pat_string + ln;
984e6dd1 1576 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1577 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1578 : ln;
1579
1580 /* We have 'lnc' characters to match in the pattern, but because of
1581 * multi-character folding, each character in the target can match
1582 * up to 3 characters (Unicode guarantees it will never exceed
1583 * this) if it is utf8-encoded; and up to 2 if not (based on the
1584 * fact that the Latin 1 folds are already determined, and the
1585 * only multi-char fold in that range is the sharp-s folding to
1586 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1587 * string character. Adjust lnc accordingly, rounding up, so that
1588 * if we need to match at least 4+1/3 chars, that really is 5. */
1589 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1590 lnc = (lnc + expansion - 1) / expansion;
1591
1592 /* As in the non-UTF8 case, if we have to match 3 characters, and
1593 * only 2 are left, it's guaranteed to fail, so don't start a
1594 * match that would require us to go beyond the end of the string
1595 */
1596 e = HOP3c(strend, -((I32)lnc), s);
1597
1598 if (!reginfo && e < s) {
1599 e = s; /* Due to minlen logic of intuit() */
1600 }
0658cdde 1601
73104a1b
KW
1602 /* XXX Note that we could recalculate e to stop the loop earlier,
1603 * as the worst case expansion above will rarely be met, and as we
1604 * go along we would usually find that e moves further to the left.
1605 * This would happen only after we reached the point in the loop
1606 * where if there were no expansion we should fail. Unclear if
1607 * worth the expense */
1608
1609 while (s <= e) {
1610 char *my_strend= (char *)strend;
1611 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1612 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
73104a1b
KW
1613 && (!reginfo || regtry(reginfo, &s)) )
1614 {
1615 goto got_it;
1616 }
1617 s += (utf8_target) ? UTF8SKIP(s) : 1;
1618 }
1619 break;
1620 }
1621 case BOUNDL:
272d35c9 1622 RXp_MATCH_TAINTED_on(prog);
0eb30aeb
KW
1623 FBC_BOUND(isWORDCHAR_LC,
1624 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1626 break;
1627 case NBOUNDL:
272d35c9 1628 RXp_MATCH_TAINTED_on(prog);
0eb30aeb
KW
1629 FBC_NBOUND(isWORDCHAR_LC,
1630 isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1631 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1632 break;
1633 case BOUND:
1634 FBC_BOUND(isWORDCHAR,
0eb30aeb 1635 isWORDCHAR_uni(tmp),
03940dc2 1636 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1637 break;
1638 case BOUNDA:
1639 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1640 isWORDCHAR_A(tmp),
1641 isWORDCHAR_A((U8*)s));
1642 break;
1643 case NBOUND:
1644 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1645 isWORDCHAR_uni(tmp),
03940dc2 1646 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1647 break;
1648 case NBOUNDA:
1649 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1650 isWORDCHAR_A(tmp),
1651 isWORDCHAR_A((U8*)s));
1652 break;
1653 case BOUNDU:
1654 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1655 isWORDCHAR_uni(tmp),
03940dc2 1656 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1657 break;
1658 case NBOUNDU:
1659 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1660 isWORDCHAR_uni(tmp),
03940dc2 1661 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1662 break;
73104a1b
KW
1663 case LNBREAK:
1664 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1665 is_LNBREAK_latin1_safe(s, strend)
1666 );
1667 break;
3018b823
KW
1668
1669 /* The argument to all the POSIX node types is the class number to pass to
1670 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1671
1672 case NPOSIXL:
1673 to_complement = 1;
1674 /* FALLTHROUGH */
1675
1676 case POSIXL:
272d35c9 1677 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
1678 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1679 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1680 break;
3018b823
KW
1681
1682 case NPOSIXD:
1683 to_complement = 1;
1684 /* FALLTHROUGH */
1685
1686 case POSIXD:
1687 if (utf8_target) {
1688 goto posix_utf8;
1689 }
1690 goto posixa;
1691
1692 case NPOSIXA:
1693 if (utf8_target) {
1694 /* The complement of something that matches only ASCII matches all
1695 * UTF-8 variant code points, plus everything in ASCII that isn't
1696 * in the class */
1697 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1698 || ! _generic_isCC_A(*s, FLAGS(c)));
1699 break;
1700 }
1701
1702 to_complement = 1;
1703 /* FALLTHROUGH */
1704
73104a1b 1705 case POSIXA:
3018b823 1706 posixa:
73104a1b 1707 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1708 * byte invariant character. */
1709 REXEC_FBC_CLASS_SCAN(
1710 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1711 break;
3018b823
KW
1712
1713 case NPOSIXU:
1714 to_complement = 1;
1715 /* FALLTHROUGH */
1716
1717 case POSIXU:
1718 if (! utf8_target) {
1719 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1720 FLAGS(c))));
1721 }
1722 else {
1723
1724 posix_utf8:
1725 classnum = (_char_class_number) FLAGS(c);
1726 if (classnum < _FIRST_NON_SWASH_CC) {
1727 while (s < strend) {
1728
1729 /* We avoid loading in the swash as long as possible, but
1730 * should we have to, we jump to a separate loop. This
1731 * extra 'if' statement is what keeps this code from being
1732 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1733 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1734 goto found_above_latin1;
1735 }
1736 if ((UTF8_IS_INVARIANT(*s)
1737 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1738 classnum)))
1739 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1740 && to_complement ^ cBOOL(
1741 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1742 classnum))))
1743 {
1744 if (tmp && (!reginfo || regtry(reginfo, &s)))
1745 goto got_it;
1746 else {
1747 tmp = doevery;
1748 }
1749 }
1750 else {
1751 tmp = 1;
1752 }
1753 s += UTF8SKIP(s);
1754 }
1755 }
1756 else switch (classnum) { /* These classes are implemented as
1757 macros */
1758 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1759 revert the change of \v matching this */
1760 /* FALL THROUGH */
1761
1762 case _CC_ENUM_PSXSPC:
1763 REXEC_FBC_UTF8_CLASS_SCAN(
1764 to_complement ^ cBOOL(isSPACE_utf8(s)));
1765 break;
1766
1767 case _CC_ENUM_BLANK:
1768 REXEC_FBC_UTF8_CLASS_SCAN(
1769 to_complement ^ cBOOL(isBLANK_utf8(s)));
1770 break;
1771
1772 case _CC_ENUM_XDIGIT:
1773 REXEC_FBC_UTF8_CLASS_SCAN(
1774 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1775 break;
1776
1777 case _CC_ENUM_VERTSPACE:
1778 REXEC_FBC_UTF8_CLASS_SCAN(
1779 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1780 break;
1781
1782 case _CC_ENUM_CNTRL:
1783 REXEC_FBC_UTF8_CLASS_SCAN(
1784 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1785 break;
1786
1787 default:
1788 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1789 assert(0); /* NOTREACHED */
1790 }
1791 }
1792 break;
1793
1794 found_above_latin1: /* Here we have to load a swash to get the result
1795 for the current code point */
1796 if (! PL_utf8_swash_ptrs[classnum]) {
1797 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1798 PL_utf8_swash_ptrs[classnum] =
1799 _core_swash_init("utf8", swash_property_names[classnum],
1800 &PL_sv_undef, 1, 0, NULL, &flags);
1801 }
1802
1803 /* This is a copy of the loop above for swash classes, though using the
1804 * FBC macro instead of being expanded out. Since we've loaded the
1805 * swash, we don't have to check for that each time through the loop */
1806 REXEC_FBC_UTF8_CLASS_SCAN(
1807 to_complement ^ cBOOL(_generic_utf8(
1808 classnum,
1809 s,
1810 swash_fetch(PL_utf8_swash_ptrs[classnum],
1811 (U8 *) s, TRUE))));
73104a1b
KW
1812 break;
1813
1814 case AHOCORASICKC:
1815 case AHOCORASICK:
1816 {
1817 DECL_TRIE_TYPE(c);
1818 /* what trie are we using right now */
1819 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1820 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1821 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1822
1823 const char *last_start = strend - trie->minlen;
6148ee25 1824#ifdef DEBUGGING
73104a1b 1825 const char *real_start = s;
6148ee25 1826#endif
73104a1b
KW
1827 STRLEN maxlen = trie->maxlen;
1828 SV *sv_points;
1829 U8 **points; /* map of where we were in the input string
1830 when reading a given char. For ASCII this
1831 is unnecessary overhead as the relationship
1832 is always 1:1, but for Unicode, especially
1833 case folded Unicode this is not true. */
1834 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1835 U8 *bitmap=NULL;
1836
1837
1838 GET_RE_DEBUG_FLAGS_DECL;
1839
1840 /* We can't just allocate points here. We need to wrap it in
1841 * an SV so it gets freed properly if there is a croak while
1842 * running the match */
1843 ENTER;
1844 SAVETMPS;
1845 sv_points=newSV(maxlen * sizeof(U8 *));
1846 SvCUR_set(sv_points,
1847 maxlen * sizeof(U8 *));
1848 SvPOK_on(sv_points);
1849 sv_2mortal(sv_points);
1850 points=(U8**)SvPV_nolen(sv_points );
1851 if ( trie_type != trie_utf8_fold
1852 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1853 {
1854 if (trie->bitmap)
1855 bitmap=(U8*)trie->bitmap;
1856 else
1857 bitmap=(U8*)ANYOF_BITMAP(c);
1858 }
1859 /* this is the Aho-Corasick algorithm modified a touch
1860 to include special handling for long "unknown char" sequences.
1861 The basic idea being that we use AC as long as we are dealing
1862 with a possible matching char, when we encounter an unknown char
1863 (and we have not encountered an accepting state) we scan forward
1864 until we find a legal starting char.
1865 AC matching is basically that of trie matching, except that when
1866 we encounter a failing transition, we fall back to the current
1867 states "fail state", and try the current char again, a process
1868 we repeat until we reach the root state, state 1, or a legal
1869 transition. If we fail on the root state then we can either
1870 terminate if we have reached an accepting state previously, or
1871 restart the entire process from the beginning if we have not.
1872
1873 */
1874 while (s <= last_start) {
1875 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1876 U8 *uc = (U8*)s;
1877 U16 charid = 0;
1878 U32 base = 1;
1879 U32 state = 1;
1880 UV uvc = 0;
1881 STRLEN len = 0;
1882 STRLEN foldlen = 0;
1883 U8 *uscan = (U8*)NULL;
1884 U8 *leftmost = NULL;
1885#ifdef DEBUGGING
1886 U32 accepted_word= 0;
786e8c11 1887#endif
73104a1b
KW
1888 U32 pointpos = 0;
1889
1890 while ( state && uc <= (U8*)strend ) {
1891 int failed=0;
1892 U32 word = aho->states[ state ].wordnum;
1893
1894 if( state==1 ) {
1895 if ( bitmap ) {
1896 DEBUG_TRIE_EXECUTE_r(
1897 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1898 dump_exec_pos( (char *)uc, c, strend, real_start,
1899 (char *)uc, utf8_target );
1900 PerlIO_printf( Perl_debug_log,
1901 " Scanning for legal start char...\n");
1902 }
1903 );
1904 if (utf8_target) {
1905 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1906 uc += UTF8SKIP(uc);
1907 }
1908 } else {
1909 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1910 uc++;
1911 }
786e8c11 1912 }
73104a1b 1913 s= (char *)uc;
07be1b83 1914 }
73104a1b
KW
1915 if (uc >(U8*)last_start) break;
1916 }
1917
1918 if ( word ) {
1919 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1920 if (!leftmost || lpos < leftmost) {
1921 DEBUG_r(accepted_word=word);
1922 leftmost= lpos;
7016d6eb 1923 }
73104a1b 1924 if (base==0) break;
7016d6eb 1925
73104a1b
KW
1926 }
1927 points[pointpos++ % maxlen]= uc;
1928 if (foldlen || uc < (U8*)strend) {
1929 REXEC_TRIE_READ_CHAR(trie_type, trie,
1930 widecharmap, uc,
1931 uscan, len, uvc, charid, foldlen,
1932 foldbuf, uniflags);
1933 DEBUG_TRIE_EXECUTE_r({
1934 dump_exec_pos( (char *)uc, c, strend,
1935 real_start, s, utf8_target);
1936 PerlIO_printf(Perl_debug_log,
1937 " Charid:%3u CP:%4"UVxf" ",
1938 charid, uvc);
1939 });
1940 }
1941 else {
1942 len = 0;
1943 charid = 0;
1944 }
07be1b83 1945
73104a1b
KW
1946
1947 do {
6148ee25 1948#ifdef DEBUGGING
73104a1b 1949 word = aho->states[ state ].wordnum;
6148ee25 1950#endif
73104a1b
KW
1951 base = aho->states[ state ].trans.base;
1952
1953 DEBUG_TRIE_EXECUTE_r({
1954 if (failed)
1955 dump_exec_pos( (char *)uc, c, strend, real_start,
1956 s, utf8_target );
1957 PerlIO_printf( Perl_debug_log,
1958 "%sState: %4"UVxf", word=%"UVxf,
1959 failed ? " Fail transition to " : "",
1960 (UV)state, (UV)word);
1961 });
1962 if ( base ) {
1963 U32 tmp;
1964 I32 offset;
1965 if (charid &&
1966 ( ((offset = base + charid
1967 - 1 - trie->uniquecharcount)) >= 0)
1968 && ((U32)offset < trie->lasttrans)
1969 && trie->trans[offset].check == state
1970 && (tmp=trie->trans[offset].next))
1971 {
1972 DEBUG_TRIE_EXECUTE_r(
1973 PerlIO_printf( Perl_debug_log," - legal\n"));
1974 state = tmp;
1975 break;
07be1b83
YO
1976 }
1977 else {
786e8c11 1978 DEBUG_TRIE_EXECUTE_r(
73104a1b 1979 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 1980 failed = 1;
73104a1b 1981 state = aho->fail[state];
07be1b83 1982 }
07be1b83 1983 }
73104a1b
KW
1984 else {
1985 /* we must be accepting here */
1986 DEBUG_TRIE_EXECUTE_r(
1987 PerlIO_printf( Perl_debug_log," - accepting\n"));
1988 failed = 1;
1989 break;
786e8c11 1990 }
73104a1b
KW
1991 } while(state);
1992 uc += len;
1993 if (failed) {
1994 if (leftmost)
1995 break;
1996 if (!state) state = 1;
07be1b83 1997 }
73104a1b
KW
1998 }
1999 if ( aho->states[ state ].wordnum ) {
2000 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2001 if (!leftmost || lpos < leftmost) {
2002 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2003 leftmost = lpos;
07be1b83
YO
2004 }
2005 }
73104a1b
KW
2006 if (leftmost) {
2007 s = (char*)leftmost;
2008 DEBUG_TRIE_EXECUTE_r({
2009 PerlIO_printf(
2010 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2011 (UV)accepted_word, (IV)(s - real_start)
2012 );
2013 });
2014 if (!reginfo || regtry(reginfo, &s)) {
2015 FREETMPS;
2016 LEAVE;
2017 goto got_it;
2018 }
2019 s = HOPc(s,1);
2020 DEBUG_TRIE_EXECUTE_r({
2021 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2022 });
2023 } else {
2024 DEBUG_TRIE_EXECUTE_r(
2025 PerlIO_printf( Perl_debug_log,"No match.\n"));
2026 break;
2027 }
2028 }
2029 FREETMPS;
2030 LEAVE;
2031 }
2032 break;
2033 default:
2034 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2035 break;
2036 }
2037 return 0;
2038 got_it:
2039 return s;
6eb5f6b9
JH
2040}
2041
fae667d5 2042
6eb5f6b9
JH
2043/*
2044 - regexec_flags - match a regexp against a string
2045 */
2046I32
5aaab254 2047Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
6eb5f6b9 2048 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2049/* stringarg: the point in the string at which to begin matching */
2050/* strend: pointer to null at end of string */
2051/* strbeg: real beginning of string */
2052/* minend: end of match must be >= minend bytes after stringarg. */
2053/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2054 * itself is accessed via the pointers above */
2055/* data: May be used for some additional optimizations.
2056 Currently its only used, with a U32 cast, for transmitting
2057 the ganch offset when doing a /g match. This will change */
2058/* nosave: For optimizations. */
2059
6eb5f6b9 2060{
97aff369 2061 dVAR;
8d919b0a 2062 struct regexp *const prog = ReANY(rx);
5aaab254 2063 char *s;
eb578fdb 2064 regnode *c;
5aaab254 2065 char *startpos = stringarg;
6eb5f6b9
JH
2066 I32 minlen; /* must match at least this many chars */
2067 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2068 I32 end_shift = 0; /* Same for the end. */ /* CC */
2069 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2070 char *scream_olds = NULL;
f2ed9b32 2071 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2072 I32 multiline;
f8fc2ecf 2073 RXi_GET_DECL(prog,progi);
3b0527fe 2074 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2075 regexp_paren_pair *swap = NULL;
a3621e74
YO
2076 GET_RE_DEBUG_FLAGS_DECL;
2077
7918f24d 2078 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2079 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2080
2081 /* Be paranoid... */
2082 if (prog == NULL || startpos == NULL) {
2083 Perl_croak(aTHX_ "NULL regexp parameter");
2084 return 0;
2085 }
2086
bbe252da 2087 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2088 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2089
f2ed9b32 2090 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2091 DEBUG_EXECUTE_r(
f2ed9b32 2092 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2093 "Matching");
2094 );
bac06658 2095
6eb5f6b9 2096 minlen = prog->minlen;
1de06328
YO
2097
2098 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2099 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2100 "String too short [regexec_flags]...\n"));
2101 goto phooey;
1aa99e6b 2102 }
6eb5f6b9 2103
1de06328 2104
6eb5f6b9 2105 /* Check validity of program. */
f8fc2ecf 2106 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2107 Perl_croak(aTHX_ "corrupted regexp program");
2108 }
2109
272d35c9 2110 RX_MATCH_TAINTED_off(rx);
ed301438 2111 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2112 PL_reg_maxiter = 0;
2113
984e6dd1 2114 reginfo.is_utf8_pat = cBOOL(RX_UTF8(rx));
39819bd9 2115 reginfo.warned = FALSE;
6eb5f6b9 2116 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2117 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2118 PL_bostr = strbeg;
3b0527fe 2119 reginfo.sv = sv;
6eb5f6b9
JH
2120
2121 /* Mark end of line for $ (and such) */
2122 PL_regeol = strend;
2123
2124 /* see how far we have to get to not match where we matched before */
3b0527fe 2125 reginfo.till = startpos+minend;
6eb5f6b9 2126
6eb5f6b9
JH
2127 /* If there is a "must appear" string, look for it. */
2128 s = startpos;
2129
bbe252da 2130 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2131 MAGIC *mg;
2c296965 2132 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2133 reginfo.ganch = startpos + prog->gofs;
2c296965 2134 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2135 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2136 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2137 && SvMAGIC(sv)
14befaf4
DM
2138 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2139 && mg->mg_len >= 0) {
3b0527fe 2140 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2141 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2142 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2143
bbe252da 2144 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2145 if (s > reginfo.ganch)
6eb5f6b9 2146 goto phooey;
58e23c8d 2147 s = reginfo.ganch - prog->gofs;
2c296965 2148 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2149 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2150 if (s < strbeg)
2151 goto phooey;
6eb5f6b9
JH
2152 }
2153 }
58e23c8d 2154 else if (data) {
70685ca0 2155 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2156 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2157 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2158
2159 } else { /* pos() not defined */
3b0527fe 2160 reginfo.ganch = strbeg;
2c296965
YO
2161 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2162 "GPOS: reginfo.ganch = strbeg\n"));
2163 }
6eb5f6b9 2164 }
288b8c02 2165 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2166 /* We have to be careful. If the previous successful match
2167 was from this regex we don't want a subsequent partially
2168 successful match to clobber the old results.
2169 So when we detect this possibility we add a swap buffer
d8da0584
KW
2170 to the re, and switch the buffer each match. If we fail,
2171 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2172 */
2173 swap = prog->offs;
2174 /* do we need a save destructor here for eval dies? */
2175 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2176 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2177 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2178 PTR2UV(prog),
2179 PTR2UV(swap),
2180 PTR2UV(prog->offs)
2181 ));
c74340f9 2182 }
a0714e2c 2183 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2184 re_scream_pos_data d;
2185
2186 d.scream_olds = &scream_olds;
2187 d.scream_pos = &scream_pos;
288b8c02 2188 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2189 if (!s) {
a3621e74 2190 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2191 goto phooey; /* not present */
3fa9c3d7 2192 }
6eb5f6b9
JH
2193 }
2194
1de06328 2195
6eb5f6b9
JH
2196
2197 /* Simplest case: anchored match need be tried only once. */
2198 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2199 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2200 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2201 goto got_it;
bbe252da
YO
2202 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2203 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2204 {
2205 char *end;
2206
2207 if (minlen)
2208 dontbother = minlen - 1;
1aa99e6b 2209 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2210 /* for multiline we only have to try after newlines */
33b8afdf 2211 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2212 /* because of the goto we can not easily reuse the macros for bifurcating the
2213 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2214 if (utf8_target) {
2215 if (s == startpos)
2216 goto after_try_utf8;
2217 while (1) {
2218 if (regtry(&reginfo, &s)) {
2219 goto got_it;
2220 }
2221 after_try_utf8:
2222 if (s > end) {
2223 goto phooey;
2224 }
2225 if (prog->extflags & RXf_USE_INTUIT) {
2226 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2227 if (!s) {
2228 goto phooey;
2229 }
2230 }
2231 else {
2232 s += UTF8SKIP(s);
2233 }
2234 }
2235 } /* end search for check string in unicode */
2236 else {
2237 if (s == startpos) {
2238 goto after_try_latin;
2239 }
2240 while (1) {
2241 if (regtry(&reginfo, &s)) {
2242 goto got_it;
2243 }
2244 after_try_latin:
2245 if (s > end) {
2246 goto phooey;
2247 }
2248 if (prog->extflags & RXf_USE_INTUIT) {
2249 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2250 if (!s) {
2251 goto phooey;
2252 }
2253 }
2254 else {
2255 s++;
2256 }
2257 }
2258 } /* end search for check string in latin*/
2259 } /* end search for check string */
2260 else { /* search for newline */
2261 if (s > startpos) {
2262 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2263 s--;
92f3d482 2264 }
21eede78
YO
2265 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2266 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2267 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2268 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2269 goto got_it;
2270 }
92f3d482
YO
2271 }
2272 } /* end search for newline */
2273 } /* end anchored/multiline check string search */
6eb5f6b9 2274 goto phooey;
bbe252da 2275 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2276 {
486ec47a 2277 /* the warning about reginfo.ganch being used without initialization
bbe252da 2278 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2279 and we only enter this block when the same bit is set. */
58e23c8d 2280 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2281
2282 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2283 goto got_it;
2284 goto phooey;
2285 }
2286
2287 /* Messy cases: unanchored match. */
bbe252da 2288 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2289 /* we have /x+whatever/ */
984e6dd1 2290 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2291 char ch;
bf93d4cc
GS
2292#ifdef DEBUGGING
2293 int did_match = 0;
2294#endif
f2ed9b32 2295 if (utf8_target) {
7e0d5ad7
KW
2296 if (! prog->anchored_utf8) {
2297 to_utf8_substr(prog);
2298 }
2299 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2300 REXEC_FBC_SCAN(
6eb5f6b9 2301 if (*s == ch) {
a3621e74 2302 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2303 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2304 s += UTF8SKIP(s);
2305 while (s < strend && *s == ch)
2306 s += UTF8SKIP(s);
2307 }
4cadc6a9 2308 );
7e0d5ad7 2309
6eb5f6b9
JH
2310 }
2311 else {
7e0d5ad7
KW
2312 if (! prog->anchored_substr) {
2313 if (! to_byte_substr(prog)) {
6b54ddc5 2314 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2315 }
2316 }
2317 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2318 REXEC_FBC_SCAN(
6eb5f6b9 2319 if (*s == ch) {
a3621e74 2320 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2321 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2322 s++;
2323 while (s < strend && *s == ch)
2324 s++;
2325 }
4cadc6a9 2326 );
6eb5f6b9 2327 }
a3621e74 2328 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2329 PerlIO_printf(Perl_debug_log,
b7953727
JH
2330 "Did not find anchored character...\n")
2331 );
6eb5f6b9 2332 }
a0714e2c
SS
2333 else if (prog->anchored_substr != NULL
2334 || prog->anchored_utf8 != NULL
2335 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2336 && prog->float_max_offset < strend - s)) {
2337 SV *must;
2338 I32 back_max;
2339 I32 back_min;
2340 char *last;
6eb5f6b9 2341 char *last1; /* Last position checked before */
bf93d4cc
GS
2342#ifdef DEBUGGING
2343 int did_match = 0;
2344#endif
33b8afdf 2345 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2346 if (utf8_target) {
2347 if (! prog->anchored_utf8) {
2348 to_utf8_substr(prog);
2349 }
2350 must = prog->anchored_utf8;
2351 }
2352 else {
2353 if (! prog->anchored_substr) {
2354 if (! to_byte_substr(prog)) {
6b54ddc5 2355 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2356 }
2357 }
2358 must = prog->anchored_substr;
2359 }
33b8afdf
JH
2360 back_max = back_min = prog->anchored_offset;
2361 } else {
7e0d5ad7
KW
2362 if (utf8_target) {
2363 if (! prog->float_utf8) {
2364 to_utf8_substr(prog);
2365 }
2366 must = prog->float_utf8;
2367 }
2368 else {
2369 if (! prog->float_substr) {
2370 if (! to_byte_substr(prog)) {
6b54ddc5 2371 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2372 }
2373 }
2374 must = prog->float_substr;
2375 }
33b8afdf
JH
2376 back_max = prog->float_max_offset;
2377 back_min = prog->float_min_offset;
2378 }
1de06328 2379
1de06328
YO
2380 if (back_min<0) {
2381 last = strend;
2382 } else {
2383 last = HOP3c(strend, /* Cannot start after this */
2384 -(I32)(CHR_SVLEN(must)
2385 - (SvTAIL(must) != 0) + back_min), strbeg);
2386 }
6eb5f6b9
JH
2387 if (s > PL_bostr)
2388 last1 = HOPc(s, -1);
2389 else
2390 last1 = s - 1; /* bogus */
2391
a0288114 2392 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2393 check_substr==must. */
2394 scream_pos = -1;
2395 dontbother = end_shift;
2396 strend = HOPc(strend, -dontbother);
2397 while ( (s <= last) &&
c33e64f0 2398 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2399 (unsigned char*)strend, must,
c33e64f0 2400 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2401 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2402 if (HOPc(s, -back_max) > last1) {
2403 last1 = HOPc(s, -back_min);
2404 s = HOPc(s, -back_max);
2405 }
2406 else {
52657f30 2407 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2408
2409 last1 = HOPc(s, -back_min);
52657f30 2410 s = t;
6eb5f6b9 2411 }
f2ed9b32 2412 if (utf8_target) {
6eb5f6b9 2413 while (s <= last1) {
24b23f37 2414 if (regtry(&reginfo, &s))
6eb5f6b9 2415 goto got_it;
7016d6eb
DM
2416 if (s >= last1) {
2417 s++; /* to break out of outer loop */
2418 break;
2419 }
2420 s += UTF8SKIP(s);
6eb5f6b9
JH
2421 }
2422 }
2423 else {
2424 while (s <= last1) {
24b23f37 2425 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2426 goto got_it;
2427 s++;
2428 }
2429 }
2430 }
ab3bbdeb 2431 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2432 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2433 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2434 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2435 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2436 ? "anchored" : "floating"),
ab3bbdeb
YO
2437 quoted, RE_SV_TAIL(must));
2438 });
6eb5f6b9
JH
2439 goto phooey;
2440 }
f8fc2ecf 2441 else if ( (c = progi->regstclass) ) {
f14c76ed 2442 if (minlen) {
f8fc2ecf 2443 const OPCODE op = OP(progi->regstclass);
66e933ab 2444 /* don't bother with what can't match */
786e8c11 2445 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2446 strend = HOPc(strend, -(minlen - 1));
2447 }
a3621e74 2448 DEBUG_EXECUTE_r({
be8e71aa 2449 SV * const prop = sv_newmortal();
32fc9b6a 2450 regprop(prog, prop, c);
0df25f3d 2451 {
f2ed9b32 2452 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2453 s,strend-s,60);
0df25f3d 2454 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2455 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2456 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2457 quoted, (int)(strend - s));
0df25f3d 2458 }
ffc61ed2 2459 });
984e6dd1 2460 if (find_byclass(prog, c, s, strend, &reginfo, reginfo.is_utf8_pat))
6eb5f6b9 2461 goto got_it;
07be1b83 2462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2463 }
2464 else {
2465 dontbother = 0;
a0714e2c 2466 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2467 /* Trim the end. */
6af40bd7 2468 char *last= NULL;
33b8afdf 2469 SV* float_real;
c33e64f0
FC
2470 STRLEN len;
2471 const char *little;
33b8afdf 2472
7e0d5ad7
KW
2473 if (utf8_target) {
2474 if (! prog->float_utf8) {
2475 to_utf8_substr(prog);
2476 }
2477 float_real = prog->float_utf8;
2478 }
2479 else {
2480 if (! prog->float_substr) {
2481 if (! to_byte_substr(prog)) {
6b54ddc5 2482 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2483 }
2484 }
2485 float_real = prog->float_substr;
2486 }
d6a28714 2487
c33e64f0
FC
2488 little = SvPV_const(float_real, len);
2489 if (SvTAIL(float_real)) {
7f18ad16
KW
2490 /* This means that float_real contains an artificial \n on
2491 * the end due to the presence of something like this:
2492 * /foo$/ where we can match both "foo" and "foo\n" at the
2493 * end of the string. So we have to compare the end of the
2494 * string first against the float_real without the \n and
2495 * then against the full float_real with the string. We
2496 * have to watch out for cases where the string might be
2497 * smaller than the float_real or the float_real without
2498 * the \n. */
1a13b075
YO
2499 char *checkpos= strend - len;
2500 DEBUG_OPTIMISE_r(
2501 PerlIO_printf(Perl_debug_log,
2502 "%sChecking for float_real.%s\n",
2503 PL_colors[4], PL_colors[5]));
2504 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2505 /* can't match, even if we remove the trailing \n
2506 * string is too short to match */
1a13b075
YO
2507 DEBUG_EXECUTE_r(
2508 PerlIO_printf(Perl_debug_log,
2509 "%sString shorter than required trailing substring, cannot match.%s\n",
2510 PL_colors[4], PL_colors[5]));
2511 goto phooey;
2512 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2513 /* can match, the end of the string matches without the
2514 * "\n" */
1a13b075
YO
2515 last = checkpos + 1;
2516 } else if (checkpos < strbeg) {
7f18ad16
KW
2517 /* cant match, string is too short when the "\n" is
2518 * included */
1a13b075
YO
2519 DEBUG_EXECUTE_r(
2520 PerlIO_printf(Perl_debug_log,
2521 "%sString does not contain required trailing substring, cannot match.%s\n",
2522 PL_colors[4], PL_colors[5]));
2523 goto phooey;
2524 } else if (!multiline) {
7f18ad16
KW
2525 /* non multiline match, so compare with the "\n" at the
2526 * end of the string */
1a13b075
YO
2527 if (memEQ(checkpos, little, len)) {
2528 last= checkpos;
2529 } else {
2530 DEBUG_EXECUTE_r(
2531 PerlIO_printf(Perl_debug_log,
2532 "%sString does not contain required trailing substring, cannot match.%s\n",
2533 PL_colors[4], PL_colors[5]));
2534 goto phooey;
2535 }
2536 } else {
7f18ad16
KW
2537 /* multiline match, so we have to search for a place
2538 * where the full string is located */
d6a28714 2539 goto find_last;
1a13b075 2540 }
c33e64f0 2541 } else {
d6a28714 2542 find_last:
9041c2e3 2543 if (len)
d6a28714 2544 last = rninstr(s, strend, little, little + len);
b8c5462f 2545 else
a0288114 2546 last = strend; /* matching "$" */
b8c5462f 2547 }
6af40bd7 2548 if (!last) {
7f18ad16
KW
2549 /* at one point this block contained a comment which was
2550 * probably incorrect, which said that this was a "should not
2551 * happen" case. Even if it was true when it was written I am
2552 * pretty sure it is not anymore, so I have removed the comment
2553 * and replaced it with this one. Yves */
6bda09f9
YO
2554 DEBUG_EXECUTE_r(
2555 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2556 "String does not contain required substring, cannot match.\n"
2557 ));
2558 goto phooey;
bf93d4cc 2559 }
d6a28714
JH
2560 dontbother = strend - last + prog->float_min_offset;
2561 }
2562 if (minlen && (dontbother < minlen))
2563 dontbother = minlen - 1;
2564 strend -= dontbother; /* this one's always in bytes! */
2565 /* We don't know much -- general case. */
f2ed9b32 2566 if (utf8_target) {
d6a28714 2567 for (;;) {
24b23f37 2568 if (regtry(&reginfo, &s))
d6a28714
JH
2569 goto got_it;
2570 if (s >= strend)
2571 break;
b8c5462f 2572 s += UTF8SKIP(s);
d6a28714
JH
2573 };
2574 }
2575 else {
2576 do {
24b23f37 2577 if (regtry(&reginfo, &s))
d6a28714
JH
2578 goto got_it;
2579 } while (s++ < strend);
2580 }
2581 }
2582
2583 /* Failure. */
2584 goto phooey;
2585
2586got_it:
495f47a5
DM
2587 DEBUG_BUFFERS_r(
2588 if (swap)
2589 PerlIO_printf(Perl_debug_log,
2590 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2591 PTR2UV(prog),
2592 PTR2UV(swap)
2593 );
2594 );
e9105d30 2595 Safefree(swap);
d6a28714 2596
ed301438 2597 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2598 restore_pos(aTHX_ prog);
5daac39c
NC
2599 if (RXp_PAREN_NAMES(prog))
2600 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2601
2602 /* make sure $`, $&, $', and $digit will work later */
2603 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2604 if (flags & REXEC_COPY_STR) {
db2c6cb3
FC
2605#ifdef PERL_ANY_COW
2606 if (SvCANCOW(sv)) {
ed252734
NC
2607 if (DEBUG_C_TEST) {
2608 PerlIO_printf(Perl_debug_log,
2609 "Copy on write: regexp capture, type %d\n",
2610 (int) SvTYPE(sv));
2611 }
77f8f7c1 2612 RX_MATCH_COPY_FREE(rx);
ed252734 2613 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2614 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2615 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2616 prog->sublen = PL_regeol - strbeg;
2617 prog->suboffset = 0;
2618 prog->subcoffset = 0;
ed252734
NC
2619 } else
2620#endif
2621 {
6502e081
DM
2622 I32 min = 0;
2623 I32 max = PL_regeol - strbeg;
2624 I32 sublen;
2625
2626 if ( (flags & REXEC_COPY_SKIP_POST)
2627 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2628 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2629 ) { /* don't copy $' part of string */
3de645a8 2630 U32 n = 0;
6502e081
DM
2631 max = -1;
2632 /* calculate the right-most part of the string covered
2633 * by a capture. Due to look-ahead, this may be to
2634 * the right of $&, so we have to scan all captures */
2635 while (n <= prog->lastparen) {
2636 if (prog->offs[n].end > max)
2637 max = prog->offs[n].end;
2638 n++;
2639 }
2640 if (max == -1)
2641 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2642 ? prog->offs[0].start
2643 : 0;
2644 assert(max >= 0 && max <= PL_regeol - strbeg);
2645 }
2646
2647 if ( (flags & REXEC_COPY_SKIP_PRE)
2648 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2649 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2650 ) { /* don't copy $` part of string */
3de645a8 2651 U32 n = 0;
6502e081
DM
2652 min = max;
2653 /* calculate the left-most part of the string covered
2654 * by a capture. Due to look-behind, this may be to
2655 * the left of $&, so we have to scan all captures */
2656 while (min && n <= prog->lastparen) {
2657 if ( prog->offs[n].start != -1
2658 && prog->offs[n].start < min)
2659 {
2660 min = prog->offs[n].start;
2661 }
2662 n++;
2663 }
2664 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2665 && min > prog->offs[0].end
2666 )
2667 min = prog->offs[0].end;
2668
2669 }
2670
2671 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2672 sublen = max - min;
2673
2674 if (RX_MATCH_COPIED(rx)) {
2675 if (sublen > prog->sublen)
2676 prog->subbeg =
2677 (char*)saferealloc(prog->subbeg, sublen+1);
2678 }
2679 else
2680 prog->subbeg = (char*)safemalloc(sublen+1);
2681 Copy(strbeg + min, prog->subbeg, sublen, char);
2682 prog->subbeg[sublen] = '\0';
2683 prog->suboffset = min;
2684 prog->sublen = sublen;
77f8f7c1 2685 RX_MATCH_COPIED_on(rx);
6502e081 2686 }
6502e081
DM
2687 prog->subcoffset = prog->suboffset;
2688 if (prog->suboffset && utf8_target) {
2689 /* Convert byte offset to chars.
2690 * XXX ideally should only compute this if @-/@+
2691 * has been seen, a la PL_sawampersand ??? */
2692
2693 /* If there's a direct correspondence between the
2694 * string which we're matching and the original SV,
2695 * then we can use the utf8 len cache associated with
2696 * the SV. In particular, it means that under //g,
2697 * sv_pos_b2u() will use the previously cached
2698 * position to speed up working out the new length of
2699 * subcoffset, rather than counting from the start of
2700 * the string each time. This stops
2701 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2702 * from going quadratic */
2703 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2704 sv_pos_b2u(sv, &(prog->subcoffset));
2705 else
2706 prog->subcoffset = utf8_length((U8*)strbeg,
2707 (U8*)(strbeg+prog->suboffset));
2708 }
d6a28714
JH
2709 }
2710 else {
6502e081 2711 RX_MATCH_COPY_FREE(rx);
d6a28714 2712 prog->subbeg = strbeg;
6502e081
DM
2713 prog->suboffset = 0;
2714 prog->subcoffset = 0;
d6a28714
JH
2715 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2716 }
2717 }
9041c2e3 2718
d6a28714
JH
2719 return 1;
2720
2721phooey:
a3621e74 2722 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2723 PL_colors[4], PL_colors[5]));
ed301438 2724 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2725 restore_pos(aTHX_ prog);
e9105d30 2726 if (swap) {
c74340f9 2727 /* we failed :-( roll it back */
495f47a5
DM
2728 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2729 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2730 PTR2UV(prog),
2731 PTR2UV(prog->offs),
2732 PTR2UV(swap)
2733 ));
e9105d30
GG
2734 Safefree(prog->offs);
2735 prog->offs = swap;
2736 }
d6a28714
JH
2737 return 0;
2738}
2739
6bda09f9 2740
ec43f78b
DM
2741/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2742 * Do inc before dec, in case old and new rex are the same */
2743#define SET_reg_curpm(Re2) \
2744 if (PL_reg_state.re_state_eval_setup_done) { \
2745 (void)ReREFCNT_inc(Re2); \
2746 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2747 PM_SETRE((PL_reg_curpm), (Re2)); \
2748 }
2749
2750
d6a28714
JH
2751/*
2752 - regtry - try match at specific point
2753 */
2754STATIC I32 /* 0 failure, 1 success */
f73aaa43 2755S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2756{
97aff369 2757 dVAR;
d6a28714 2758 CHECKPOINT lastcp;
288b8c02 2759 REGEXP *const rx = reginfo->prog;
8d919b0a 2760 regexp *const prog = ReANY(rx);
f73aaa43 2761 I32 result;
f8fc2ecf 2762 RXi_GET_DECL(prog,progi);
a3621e74 2763 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2764
2765 PERL_ARGS_ASSERT_REGTRY;
2766
24b23f37 2767 reginfo->cutpoint=NULL;
d6a28714 2768
ed301438
DM
2769 if ((prog->extflags & RXf_EVAL_SEEN)
2770 && !PL_reg_state.re_state_eval_setup_done)
2771 {
d6a28714
JH
2772 MAGIC *mg;
2773
ed301438 2774 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2775 if (reginfo->sv) {
d6a28714 2776 /* Make $_ available to executed code. */
3b0527fe 2777 if (reginfo->sv != DEFSV) {
59f00321 2778 SAVE_DEFSV;
414bf5ae 2779 DEFSV_set(reginfo->sv);
b8c5462f 2780 }
d6a28714 2781
3b0527fe
DM
2782 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2783 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2784 /* prepare for quick setting of pos */
d300d9fa 2785#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2786 if (SvIsCOW(reginfo->sv))
2787 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2788#endif
3dab1dad 2789 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2790 &PL_vtbl_mglob, NULL, 0);
d6a28714 2791 mg->mg_len = -1;
b8c5462f 2792 }
d6a28714
JH
2793 PL_reg_magic = mg;
2794 PL_reg_oldpos = mg->mg_len;
4f639d21 2795 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2796 }
09687e5a 2797 if (!PL_reg_curpm) {
a02a5408 2798 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2799#ifdef USE_ITHREADS
2800 {
14a49a24 2801 SV* const repointer = &PL_sv_undef;
92313705
NC
2802 /* this regexp is also owned by the new PL_reg_curpm, which
2803 will try to free it. */
d2ece331 2804 av_push(PL_regex_padav, repointer);
09687e5a
AB
2805 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2806 PL_regex_pad = AvARRAY(PL_regex_padav);
2807 }
2808#endif
2809 }
ec43f78b 2810 SET_reg_curpm(rx);
d6a28714
JH
2811 PL_reg_oldcurpm = PL_curpm;
2812 PL_curpm = PL_reg_curpm;
07bc277f 2813 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2814 /* Here is a serious problem: we cannot rewrite subbeg,
2815 since it may be needed if this match fails. Thus
2816 $` inside (?{}) could fail... */
2817 PL_reg_oldsaved = prog->subbeg;
2818 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2819 PL_reg_oldsavedoffset = prog->suboffset;
2820 PL_reg_oldsavedcoffset = prog->suboffset;
db2c6cb3 2821#ifdef PERL_ANY_COW
ed252734
NC
2822 PL_nrs = prog->saved_copy;
2823#endif
07bc277f 2824 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2825 }
2826 else
bd61b366 2827 PL_reg_oldsaved = NULL;
d6a28714 2828 prog->subbeg = PL_bostr;
6502e081
DM
2829 prog->suboffset = 0;
2830 prog->subcoffset = 0;
d6a28714
JH
2831 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2832 }
97ca13b7 2833#ifdef DEBUGGING
f73aaa43 2834 PL_reg_starttry = *startposp;
97ca13b7 2835#endif
f73aaa43 2836 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2837 prog->lastparen = 0;
03994de8 2838 prog->lastcloseparen = 0;
d6a28714
JH
2839
2840 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2841 to do this again and again, prog->lastparen should take care of
3dd2943c 2842 this! --ilya*/
dafc8851
JH
2843
2844 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2845 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2846 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2847 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2848 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2849 * Meanwhile, this code *is* needed for the
daf18116
JH
2850 * above-mentioned test suite tests to succeed. The common theme
2851 * on those tests seems to be returning null fields from matches.
225593e1 2852 * --jhi updated by dapm */
dafc8851 2853#if 1
d6a28714 2854 if (prog->nparens) {
b93070ed 2855 regexp_paren_pair *pp = prog->offs;
eb578fdb 2856 I32 i;
b93070ed 2857 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2858 ++pp;
2859 pp->start = -1;
2860 pp->end = -1;
d6a28714
JH
2861 }
2862 }
dafc8851 2863#endif
02db2b7b 2864 REGCP_SET(lastcp);
f73aaa43
DM
2865 result = regmatch(reginfo, *startposp, progi->program + 1);
2866 if (result != -1) {
2867 prog->offs[0].end = result;
d6a28714
JH
2868 return 1;
2869 }
24b23f37 2870 if (reginfo->cutpoint)
f73aaa43 2871 *startposp= reginfo->cutpoint;
02db2b7b 2872 REGCP_UNWIND(lastcp);
d6a28714
JH
2873 return 0;
2874}
2875
02db2b7b 2876
8ba1375e
MJD
2877#define sayYES goto yes
2878#define sayNO goto no
262b90c4 2879#define sayNO_SILENT goto no_silent
8ba1375e 2880
f9f4320a
YO
2881/* we dont use STMT_START/END here because it leads to
2882 "unreachable code" warnings, which are bogus, but distracting. */
2883#define CACHEsayNO \
c476f425
DM
2884 if (ST.cache_mask) \
2885 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2886 sayNO
3298f257 2887
a3621e74 2888/* this is used to determine how far from the left messages like
265c4333
YO
2889 'failed...' are printed. It should be set such that messages
2890 are inline with the regop output that created them.
a3621e74 2891*/
265c4333 2892#define REPORT_CODE_OFF 32
a3621e74
YO
2893
2894
40a82448
DM
2895#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2896#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2897#define CHRTEST_NOT_A_CP_1 -999
2898#define CHRTEST_NOT_A_CP_2 -998
9e137952 2899
86545054
DM
2900#define SLAB_FIRST(s) (&(s)->states[0])
2901#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2902
5d9a96ca
DM
2903/* grab a new slab and return the first slot in it */
2904
2905STATIC regmatch_state *
2906S_push_slab(pTHX)
2907{
a35a87e7 2908#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2909 dMY_CXT;
2910#endif
5d9a96ca
DM
2911 regmatch_slab *s = PL_regmatch_slab->next;
2912 if (!s) {
2913 Newx(s, 1, regmatch_slab);
2914 s->prev = PL_regmatch_slab;
2915 s->next = NULL;
2916 PL_regmatch_slab->next = s;
2917 }
2918 PL_regmatch_slab = s;
86545054 2919 return SLAB_FIRST(s);
5d9a96ca 2920}
5b47454d 2921
95b24440 2922
40a82448
DM
2923/* push a new state then goto it */
2924
4d5016e5
DM
2925#define PUSH_STATE_GOTO(state, node, input) \
2926 pushinput = input; \
40a82448
DM
2927 scan = node; \
2928 st->resume_state = state; \
2929 goto push_state;
2930
2931/* push a new state with success backtracking, then goto it */
2932
4d5016e5
DM
2933#define PUSH_YES_STATE_GOTO(state, node, input) \
2934 pushinput = input; \
40a82448
DM
2935 scan = node; \
2936 st->resume_state = state; \
2937 goto push_yes_state;
2938
aa283a38 2939
aa283a38 2940
4d5016e5 2941
d6a28714 2942/*
95b24440 2943
bf1f174e
DM
2944regmatch() - main matching routine
2945
2946This is basically one big switch statement in a loop. We execute an op,
2947set 'next' to point the next op, and continue. If we come to a point which
2948we may need to backtrack to on failure such as (A|B|C), we push a
2949backtrack state onto the backtrack stack. On failure, we pop the top
2950state, and re-enter the loop at the state indicated. If there are no more
2951states to pop, we return failure.
2952
2953Sometimes we also need to backtrack on success; for example /A+/, where
2954after successfully matching one A, we need to go back and try to
2955match another one; similarly for lookahead assertions: if the assertion
2956completes successfully, we backtrack to the state just before the assertion
2957and then carry on. In these cases, the pushed state is marked as
2958'backtrack on success too'. This marking is in fact done by a chain of
2959pointers, each pointing to the previous 'yes' state. On success, we pop to
2960the nearest yes state, discarding any intermediate failure-only states.
2961Sometimes a yes state is pushed just to force some cleanup code to be
2962called at the end of a successful match or submatch; e.g. (??{$re}) uses
2963it to free the inner regex.
2964
2965Note that failure backtracking rewinds the cursor position, while
2966success backtracking leaves it alone.
2967
2968A pattern is complete when the END op is executed, while a subpattern
2969such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2970ops trigger the "pop to last yes state if any, otherwise return true"
2971behaviour.
2972
2973A common convention in this function is to use A and B to refer to the two
2974subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2975the subpattern to be matched possibly multiple times, while B is the entire
2976rest of the pattern. Variable and state names reflect this convention.
2977
2978The states in the main switch are the union of ops and failure/success of
2979substates associated with with that op. For example, IFMATCH is the op
2980that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2981'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2982successfully matched A and IFMATCH_A_fail is a state saying that we have
2983just failed to match A. Resume states always come in pairs. The backtrack
2984state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2985at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2986on success or failure.
2987
2988The struct that holds a backtracking state is actually a big union, with
2989one variant for each major type of op. The variable st points to the
2990top-most backtrack struct. To make the code clearer, within each
2991block of code we #define ST to alias the relevant union.
2992
2993Here's a concrete example of a (vastly oversimplified) IFMATCH
2994implementation:
2995
2996 switch (state) {
2997 ....
2998
2999#define ST st->u.ifmatch
3000
3001 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3002 ST.foo = ...; // some state we wish to save
95b24440 3003 ...
bf1f174e
DM
3004 // push a yes backtrack state with a resume value of
3005 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3006 // first node of A:
4d5016e5 3007 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3008 // NOTREACHED
3009
3010 case IFMATCH_A: // we have successfully executed A; now continue with B
3011 next = B;
3012 bar = ST.foo; // do something with the preserved value
3013 break;
3014
3015 case IFMATCH_A_fail: // A failed, so the assertion failed
3016 ...; // do some housekeeping, then ...
3017 sayNO; // propagate the failure
3018
3019#undef ST
95b24440 3020
bf1f174e
DM
3021 ...
3022 }
95b24440 3023
bf1f174e
DM
3024For any old-timers reading this who are familiar with the old recursive
3025approach, the code above is equivalent to:
95b24440 3026
bf1f174e
DM
3027 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3028 {
3029 int foo = ...
95b24440 3030 ...
bf1f174e
DM
3031 if (regmatch(A)) {
3032 next = B;
3033 bar = foo;
3034 break;
95b24440 3035 }
bf1f174e
DM
3036 ...; // do some housekeeping, then ...
3037 sayNO; // propagate the failure
95b24440 3038 }
bf1f174e
DM
3039
3040The topmost backtrack state, pointed to by st, is usually free. If you
3041want to claim it, populate any ST.foo fields in it with values you wish to
3042save, then do one of
3043
4d5016e5
DM
3044 PUSH_STATE_GOTO(resume_state, node, newinput);
3045 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3046
3047which sets that backtrack state's resume value to 'resume_state', pushes a
3048new free entry to the top of the backtrack stack, then goes to 'node'.
3049On backtracking, the free slot is popped, and the saved state becomes the
3050new free state. An ST.foo field in this new top state can be temporarily
3051accessed to retrieve values, but once the main loop is re-entered, it
3052becomes available for reuse.
3053
3054Note that the depth of the backtrack stack constantly increases during the
3055left-to-right execution of the pattern, rather than going up and down with
3056the pattern nesting. For example the stack is at its maximum at Z at the
3057end of the pattern, rather than at X in the following:
3058
3059 /(((X)+)+)+....(Y)+....Z/
3060
3061The only exceptions to this are lookahead/behind assertions and the cut,
3062(?>A), which pop all the backtrack states associated with A before
3063continuing.
3064
486ec47a 3065Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3066PL_regmatch_state and st always point to the currently active state,
3067and PL_regmatch_slab points to the slab currently containing
3068PL_regmatch_state. The first time regmatch() is called, the first slab is
3069allocated, and is never freed until interpreter destruction. When the slab
3070is full, a new one is allocated and chained to the end. At exit from
3071regmatch(), slabs allocated since entry are freed.
3072
3073*/
95b24440 3074
40a82448 3075
5bc10b2c 3076#define DEBUG_STATE_pp(pp) \
265c4333 3077 DEBUG_STATE_r({ \
f2ed9b32 3078 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3079 PerlIO_printf(Perl_debug_log, \
5d458dd8 3080 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3081 depth*2, "", \
13d6edb4 3082 PL_reg_name[st->resume_state], \
5d458dd8
YO
3083 ((st==yes_state||st==mark_state) ? "[" : ""), \
3084 ((st==yes_state) ? "Y" : ""), \
3085 ((st==mark_state) ? "M" : ""), \
3086 ((st==yes_state||st==mark_state) ? "]" : "") \
3087 ); \
265c4333 3088 });
5bc10b2c 3089
40a82448 3090
3dab1dad 3091#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3092
3df15adc 3093#ifdef DEBUGGING
5bc10b2c 3094
ab3bbdeb 3095STATIC void
f2ed9b32 3096S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3097 const char *start, const char *end, const char *blurb)
3098{
efd26800 3099 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3100
3101 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3102
ab3bbdeb
YO
3103 if (!PL_colorset)
3104 reginitcolors();
3105 {
3106 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3107 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3108
f2ed9b32 3109 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3110 start, end - start, 60);
3111
3112 PerlIO_printf(Perl_debug_log,
3113 "%s%s REx%s %s against %s\n",
3114 PL_colors[4], blurb, PL_colors[5], s0, s1);
3115
f2ed9b32 3116 if (utf8_target||utf8_pat)
1de06328
YO
3117 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3118 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3119 utf8_pat && utf8_target ? " and " : "",
3120 utf8_target ? "string" : ""
ab3bbdeb
YO
3121 );
3122 }
3123}
3df15adc
YO
3124
3125STATIC void
786e8c11
YO
3126S_dump_exec_pos(pTHX_ const char *locinput,
3127 const regnode *scan,
3128 const char *loc_regeol,
3129 const char *loc_bostr,
3130 const char *loc_reg_starttry,
f2ed9b32 3131 const bool utf8_target)
07be1b83 3132{
786e8c11 3133 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3134 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3135 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3136 /* The part of the string before starttry has one color
3137 (pref0_len chars), between starttry and current
3138 position another one (pref_len - pref0_len chars),
3139 after the current position the third one.
3140 We assume that pref0_len <= pref_len, otherwise we
3141 decrease pref0_len. */
786e8c11
YO
3142 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3143 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3144 int pref0_len;
3145
7918f24d
NC
3146 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3147
f2ed9b32 3148 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3149 pref_len++;
786e8c11
YO
3150 pref0_len = pref_len - (locinput - loc_reg_starttry);
3151 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3152 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3153 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3154 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3155 l--;
3156 if (pref0_len < 0)
3157 pref0_len = 0;
3158 if (pref0_len > pref_len)
3159 pref0_len = pref_len;
3160 {
f2ed9b32 3161 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3162
ab3bbdeb 3163 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3164 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3165
ab3bbdeb 3166 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3167 (locinput - pref_len + pref0_len),
1de06328 3168 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3169
ab3bbdeb 3170 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3171 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3172
1de06328 3173 const STRLEN tlen=len0+len1+len2;
3df15adc 3174 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3175 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3176 (IV)(locinput - loc_bostr),
07be1b83 3177 len0, s0,
07be1b83 3178 len1, s1,
07be1b83 3179 (docolor ? "" : "> <"),
07be1b83 3180 len2, s2,
f9f4320a 3181 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3182 "");
3183 }
3184}
3df15adc 3185
07be1b83
YO
3186#endif
3187
0a4db386
YO
3188/* reg_check_named_buff_matched()
3189 * Checks to see if a named buffer has matched. The data array of
3190 * buffer numbers corresponding to the buffer is expected to reside
3191 * in the regexp->data->data array in the slot stored in the ARG() of
3192 * node involved. Note that this routine doesn't actually care about the
3193 * name, that information is not preserved from compilation to execution.
3194 * Returns the index of the leftmost defined buffer with the given name
3195 * or 0 if non of the buffers matched.
3196 */
3197STATIC I32
7918f24d
NC
3198S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3199{
0a4db386 3200 I32 n;
f8fc2ecf 3201 RXi_GET_DECL(rex,rexi);
ad64d0ec 3202 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3203 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3204
3205 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3206
0a4db386 3207 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3208 if ((I32)rex->lastparen >= nums[n] &&
3209 rex->offs[nums[n]].end != -1)
0a4db386
YO
3210 {
3211 return nums[n];
3212 }
3213 }
3214 return 0;
3215}
3216
2f554ef7
DM
3217
3218/* free all slabs above current one - called during LEAVE_SCOPE */
3219
3220STATIC void
3221S_clear_backtrack_stack(pTHX_ void *p)
3222{
3223 regmatch_slab *s = PL_regmatch_slab->next;
3224 PERL_UNUSED_ARG(p);
3225
3226 if (!s)
3227 return;
3228 PL_regmatch_slab->next = NULL;
3229 while (s) {
3230 regmatch_slab * const osl = s;
3231 s = s->next;
3232 Safefree(osl);
3233 }
3234}
c74f6de9 3235static bool
984e6dd1
DM
3236S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3237 U8* c1_utf8, int *c2p, U8* c2_utf8, bool is_utf8_pat)
c74f6de9 3238{
79a2a0e8
KW
3239 /* This function determines if there are one or two characters that match
3240 * the first character of the passed-in EXACTish node <text_node>, and if
3241 * so, returns them in the passed-in pointers.
c74f6de9 3242 *
79a2a0e8
KW
3243 * If it determines that no possible character in the target string can
3244 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3245 * the first character in <text_node> requires UTF-8 to represent, and the
3246 * target string isn't in UTF-8.)
c74f6de9 3247 *
79a2a0e8
KW
3248 * If there are more than two characters that could match the beginning of
3249 * <text_node>, or if more context is required to determine a match or not,
3250 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3251 *
3252 * The motiviation behind this function is to allow the caller to set up
3253 * tight loops for matching. If <text_node> is of type EXACT, there is
3254 * only one possible character that can match its first character, and so
3255 * the situation is quite simple. But things get much more complicated if
3256 * folding is involved. It may be that the first character of an EXACTFish
3257 * node doesn't participate in any possible fold, e.g., punctuation, so it
3258 * can be matched only by itself. The vast majority of characters that are
3259 * in folds match just two things, their lower and upper-case equivalents.
3260 * But not all are like that; some have multiple possible matches, or match
3261 * sequences of more than one character. This function sorts all that out.
3262 *
3263 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3264 * loop of trying to match A*, we know we can't exit where the thing
3265 * following it isn't a B. And something can't be a B unless it is the
3266 * beginning of B. By putting a quick test for that beginning in a tight
3267 * loop, we can rule out things that can't possibly be B without having to
3268 * break out of the loop, thus avoiding work. Similarly, if A is a single
3269 * character, we can make a tight loop matching A*, using the outputs of
3270 * this function.
3271 *
3272 * If the target string to match isn't in UTF-8, and there aren't
3273 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3274 * the one or two possible octets (which are characters in this situation)
3275 * that can match. In all cases, if there is only one character that can
3276 * match, *<c1p> and *<c2p> will be identical.
3277 *
3278 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3279 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3280 * can match the beginning of <text_node>. They should be declared with at
3281 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3282 * undefined what these contain.) If one or both of the buffers are
3283 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3284 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3285 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3286 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3287 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3288
3289 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8 3290
ddb0d839
KW
3291 UV c1 = CHRTEST_NOT_A_CP_1;
3292 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8
KW
3293 bool use_chrtest_void = FALSE;
3294
3295 /* Used when we have both utf8 input and utf8 output, to avoid converting
3296 * to/from code points */
3297 bool utf8_has_been_setup = FALSE;
3298
c74f6de9
KW
3299 dVAR;
3300
b4291290 3301 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3302
79a2a0e8
KW
3303 if (OP(text_node) == EXACT) {
3304
3305 /* In an exact node, only one thing can be matched, that first
3306 * character. If both the pat and the target are UTF-8, we can just
3307 * copy the input to the output, avoiding finding the code point of
3308 * that character */
984e6dd1 3309 if (!is_utf8_pat) {
79a2a0e8
KW
3310 c2 = c1 = *pat;
3311 }
3312 else if (utf8_target) {
3313 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3314 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3315 utf8_has_been_setup = TRUE;
3316 }
3317 else {
3318 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3319 }
79a2a0e8
KW
3320 }
3321 else /* an EXACTFish node */
984e6dd1 3322 if ((is_utf8_pat
79a2a0e8
KW
3323 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3324 pat + STR_LEN(text_node)))
984e6dd1 3325 || (!is_utf8_pat
79a2a0e8
KW
3326 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3327 pat + STR_LEN(text_node))))
3328 {
3329 /* Multi-character folds require more context to sort out. Also
3330 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3331 * handled outside this routine */
3332 use_chrtest_void = TRUE;
3333 }
3334 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
984e6dd1 3335 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
79a2a0e8
KW
3336 if (c1 > 256) {
3337 /* Load the folds hash, if not already done */
3338 SV** listp;
3339 if (! PL_utf8_foldclosures) {
3340 if (! PL_utf8_tofold) {
3341 U8 dummy[UTF8_MAXBYTES+1];
3342
3343 /* Force loading this by folding an above-Latin1 char */
3344 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3345 assert(PL_utf8_tofold); /* Verify that worked */
3346 }
3347 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3348 }
3349
3350 /* The fold closures data structure is a hash with the keys being
3351 * the UTF-8 of every character that is folded to, like 'k', and
3352 * the values each an array of all code points that fold to its
3353 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3354 * not included */
3355 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3356 (char *) pat,
3357 UTF8SKIP(pat),
3358 FALSE))))
3359 {
3360 /* Not found in the hash, therefore there are no folds
3361 * containing it, so there is only a single character that
3362 * could match */
3363 c2 = c1;
3364 }
3365 else { /* Does participate in folds */
3366 AV* list = (AV*) *listp;
3367 if (av_len(list) != 1) {
3368
3369 /* If there aren't exactly two folds to this, it is outside
3370 * the scope of this function */
3371 use_chrtest_void = TRUE;
3372 }
3373 else { /* There are two. Get them */
3374 SV** c_p = av_fetch(list, 0, FALSE);
3375 if (c_p == NULL) {
3376 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3377 }
3378 c1 = SvUV(*c_p);
3379
3380 c_p = av_fetch(list, 1, FALSE);
3381 if (c_p == NULL) {
3382 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3383 }
3384 c2 = SvUV(*c_p);
3385
3386 /* Folds that cross the 255/256 boundary are forbidden if
3387 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3388 * pattern character is above 256, and its only other match
3389 * is below 256, the only legal match will be to itself.
3390 * We have thrown away the original, so have to compute
3391 * which is the one above 255 */
3392 if ((c1 < 256) != (c2 < 256)) {
3393 if (OP(text_node) == EXACTFL
3394 || (OP(text_node) == EXACTFA
3395 && (isASCII(c1) || isASCII(c2))))
3396 {
3397 if (c1 < 256) {
3398 c1 = c2;
3399 }
3400 else {
3401 c2 = c1;
3402 }
3403 }
3404 }
3405 }
3406 }
3407 }
3408 else /* Here, c1 is < 255 */
3409 if (utf8_target
3410 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3411 && OP(text_node) != EXACTFL
3412 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3413 {
3414 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3415 * folds to this character in the pattern. All such cases except
3416 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3417 * involved in their folds, so are outside the scope of this
3418 * function */
3419 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3420 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3421 }
3422 else {
3423 use_chrtest_void = TRUE;
3424 }
c74f6de9
KW
3425 }
3426 else { /* Here nothing above Latin1 can fold to the pattern character */
3427 switch (OP(text_node)) {
3428
3429 case EXACTFL: /* /l rules */
79a2a0e8 3430 c2 = PL_fold_locale[c1];
c74f6de9
KW
3431 break;
3432
3433 case EXACTF:
3434 if (! utf8_target) { /* /d rules */
79a2a0e8 3435 c2 = PL_fold[c1];
c74f6de9
KW
3436 break;
3437 }
3438 /* FALLTHROUGH */
3439 /* /u rules for all these. This happens to work for
79a2a0e8 3440 * EXACTFA as nothing in Latin1 folds to ASCII */
c74f6de9
KW
3441 case EXACTFA:
3442 case EXACTFU_TRICKYFOLD:
79a2a0e8 3443 case EXACTFU_SS:
c74f6de9 3444 case EXACTFU:
79a2a0e8 3445 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3446 break;
3447
878531d3
KW
3448 default:
3449 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3450 assert(0); /* NOTREACHED */
c74f6de9
KW
3451 }
3452 }
3453 }
79a2a0e8
KW
3454
3455 /* Here have figured things out. Set up the returns */
3456 if (use_chrtest_void) {
3457 *c2p = *c1p = CHRTEST_VOID;
3458 }
3459 else if (utf8_target) {
3460 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3461 uvchr_to_utf8(c1_utf8, c1);
3462 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3463 }
c74f6de9 3464
79a2a0e8
KW
3465 /* Invariants are stored in both the utf8 and byte outputs; Use
3466 * negative numbers otherwise for the byte ones. Make sure that the
3467 * byte ones are the same iff the utf8 ones are the same */
3468 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3469 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3470 ? *c2_utf8
3471 : (c1 == c2)
3472 ? CHRTEST_NOT_A_CP_1
3473 : CHRTEST_NOT_A_CP_2;
3474 }
3475 else if (c1 > 255) {
3476 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3477 can represent */
3478 return FALSE;
3479 }
c74f6de9 3480
79a2a0e8
KW
3481 *c1p = *c2p = c2; /* c2 is the only representable value */
3482 }
3483 else { /* c1 is representable; see about c2 */
3484 *c1p = c1;
3485 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3486 }
2f554ef7 3487
c74f6de9
KW
3488 return TRUE;
3489}
2f554ef7 3490
f73aaa43
DM
3491/* returns -1 on failure, $+[0] on success */
3492STATIC I32
3493S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3494{
a35a87e7 3495#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3496 dMY_CXT;
3497#endif
27da23d5 3498 dVAR;
eb578fdb 3499 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3500 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3501 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3502 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3503 RXi_GET_DECL(rex,rexi);
2f554ef7 3504 I32 oldsave;
5d9a96ca 3505 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3506 regmatch_state *st;
5d9a96ca 3507 /* cache heavy used fields of st in registers */
eb578fdb
KW
3508 regnode *scan;
3509 regnode *next;
3510 U32 n = 0; /* general value; init to avoid compiler warning */
3511 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3512 char *locinput = startpos;
4d5016e5 3513 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3514 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3515
b69b0499 3516 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3517 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3518 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3519 const U32 max_nochange_depth =
3520 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3521 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3522 regmatch_state *yes_state = NULL; /* state to pop to on success of
3523 subpattern */
e2e6a0f1
YO
3524 /* mark_state piggy backs on the yes_state logic so that when we unwind
3525 the stack on success we can update the mark_state as we go */
3526 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3527 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3528 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3529 U32 state_num;
5d458dd8
YO
3530 bool no_final = 0; /* prevent failure from backtracking? */
3531 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3532 char *startpoint = locinput;
5d458dd8
YO
3533 SV *popmark = NULL; /* are we looking for a mark? */
3534 SV *sv_commit = NULL; /* last mark name seen in failure */
3535 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3536 during a successful match */
5d458dd8
YO
3537 U32 lastopen = 0; /* last open we saw */
3538 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3539 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3540 /* these three flags are set by various ops to signal information to
3541 * the very next op. They have a useful lifetime of exactly one loop
3542 * iteration, and are not preserved or restored by state pushes/pops
3543 */
3544 bool sw = 0; /* the condition value in (?(cond)a|b) */
3545 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3546 int logical = 0; /* the following EVAL is:
3547 0: (?{...})
3548 1: (?(?{...})X|Y)
3549 2: (??{...})
3550 or the following IFMATCH/UNLESSM is:
3551 false: plain (?=foo)
3552 true: used as a condition: (?(?=foo))
3553 */
81ed78b2
DM
3554 PAD* last_pad = NULL;
3555 dMULTICALL;
3556 I32 gimme = G_SCALAR;
3557 CV *caller_cv = NULL; /* who called us */
3558 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3559 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
92da3157 3560 U32 maxopenparen = 0; /* max '(' index seen so far */
3018b823
KW
3561 int to_complement; /* Invert the result? */
3562 _char_class_number classnum;
984e6dd1 3563 bool is_utf8_pat = reginfo->is_utf8_pat;
81ed78b2 3564
95b24440 3565#ifdef DEBUGGING
e68ec53f 3566 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3567#endif
3568
81ed78b2
DM
3569 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3570 multicall_oldcatch = 0;
3571 multicall_cv = NULL;
3572 cx = NULL;
4f8dbb2d
JL
3573 PERL_UNUSED_VAR(multicall_cop);
3574 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3575
3576
7918f24d
NC
3577 PERL_ARGS_ASSERT_REGMATCH;
3578
3b57cd43 3579 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3580 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3581 }));
5d9a96ca
DM
3582 /* on first ever call to regmatch, allocate first slab */
3583 if (!PL_regmatch_slab) {
3584 Newx(PL_regmatch_slab, 1, regmatch_slab);
3585 PL_regmatch_slab->prev = NULL;
3586 PL_regmatch_slab->next = NULL;
86545054 3587 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
5d9a96ca
DM
3588 }
3589
2f554ef7
DM
3590 oldsave = PL_savestack_ix;
3591 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3592 SAVEVPTR(PL_regmatch_slab);
3593 SAVEVPTR(PL_regmatch_state);
5d9a96ca
DM
3594
3595 /* grab next free state slot */
3596 st = ++PL_regmatch_state;
86545054 3597 if (st > SLAB_LAST(PL_regmatch_slab))
5d9a96ca
DM
3598 st = PL_regmatch_state = S_push_slab(aTHX);
3599
d6a28714 3600 /* Note that nextchr is a byte even in UTF */
7016d6eb 3601 SET_nextchr;
d6a28714
JH
3602 scan = prog;
3603 while (scan != NULL) {
8ba1375e 3604
a3621e74 3605 DEBUG_EXECUTE_r( {
6136c704 3606 SV * const prop = sv_newmortal();
1de06328 3607 regnode *rnext=regnext(scan);
f2ed9b32 3608 DUMP_EXEC_POS( locinput, scan, utf8_target );
32fc9b6a 3609 regprop(rex, prop, scan);
07be1b83
YO
3610
3611 PerlIO_printf(Perl_debug_log,
3612 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3613 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3614 SvPVX_const(prop),
1de06328 3615 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3616 0 : (IV)(rnext - rexi->program));
2a782b5b 3617 });
d6a28714
JH
3618
3619 next = scan + NEXT_OFF(scan);
3620 if (next == scan)
3621 next = NULL;
40a82448 3622 state_num = OP(scan);
d6a28714 3623
40a82448 3624 reenter_switch:
3018b823 3625 to_complement = 0;
34a81e2b 3626
7016d6eb 3627 SET_nextchr;
e6ca698c 3628 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 3629
40a82448 3630 switch (state_num) {
3c0563b9 3631 case BOL: /* /^../ */
7fba1cd6 3632 if (locinput == PL_bostr)
d6a28714 3633 {
3b0527fe 3634 /* reginfo->till = reginfo->bol; */
b8c5462f
JH
3635 break;
3636 }
d6a28714 3637 sayNO;
3c0563b9
DM
3638
3639 case MBOL: /* /^../m */
12d33761 3640 if (locinput == PL_bostr ||
7016d6eb 3641 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 3642 {
b8c5462f
JH
3643 break;
3644 }
d6a28714 3645 sayNO;
3c0563b9
DM
3646
3647 case SBOL: /* /^../s */
c2a73568 3648 if (locinput == PL_bostr)
b8c5462f 3649 break;
d6a28714 3650 sayNO;
3c0563b9
DM
3651
3652 case GPOS: /* \G */
3b0527fe 3653 if (locinput == reginfo->ganch)
d6a28714
JH
3654 break;
3655 sayNO;
ee9b8eae 3656
3c0563b9 3657 case KEEPS: /* \K */
ee9b8eae 3658 /* update the startpoint */
b93070ed 3659 st->u.keeper.val = rex->offs[0].start;
b93070ed 3660 rex->offs[0].start = locinput - PL_bostr;
4d5016e5 3661 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
878531d3 3662 assert(0); /*NOTREACHED*/
ee9b8eae
YO
3663 case KEEPS_next_fail:
3664 /* rollback the start point change */
b93070ed 3665 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 3666 sayNO_SILENT;
878531d3 3667 assert(0); /*NOTREACHED*/
3c0563b9
DM
3668
3669 case EOL: /* /..$/ */
d6a28714 3670 goto seol;
3c0563b9
DM
3671
3672 case MEOL: /* /..$/m */
7016d6eb 3673 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3674 sayNO;
b8c5462f 3675 break;
3c0563b9
DM
3676
3677 case SEOL: /* /..$/s */
d6a28714 3678 seol:
7016d6eb 3679 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 3680 sayNO;
d6a28714 3681 if (PL_regeol - locinput > 1)
b8c5462f 3682 sayNO;
b8c5462f 3683 break;
3c0563b9
DM
3684
3685 case EOS: /* \z */
7016d6eb 3686 if (!NEXTCHR_IS_EOS)
b8c5462f 3687 sayNO;
d6a28714 3688 break;
3c0563b9
DM
3689
3690 case SANY: /* /./s */
7016d6eb 3691 if (NEXTCHR_IS_EOS)
4633a7c4 3692 sayNO;
28b98f76 3693 goto increment_locinput;
3c0563b9
DM
3694
3695 case CANY: /* \C */
7016d6eb 3696 if (NEXTCHR_IS_EOS)
f33976b4 3697 sayNO;
3640db6b 3698 locinput++;
a0d0e21e 3699 break;
3c0563b9
DM
3700
3701 case REG_ANY: /* /./ */
7016d6eb 3702 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 3703 sayNO;
28b98f76
DM
3704 goto increment_locinput;
3705
166ba7cd
DM
3706
3707#undef ST
3708#define ST st->u.trie
3c0563b9 3709 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
3710 /* In this case the charclass data is available inline so
3711 we can fail fast without a lot of extra overhead.
3712 */
7016d6eb 3713 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
3714 DEBUG_EXECUTE_r(
3715 PerlIO_printf(Perl_debug_log,
3716 "%*s %sfailed to match trie start class...%s\n",
3717 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3718 );
3719 sayNO_SILENT;
118e2215 3720 assert(0); /* NOTREACHED */
786e8c11
YO
3721 }
3722 /* FALL THROUGH */
3c0563b9 3723 case TRIE: /* (ab|cd) */
2e64971a
DM
3724 /* the basic plan of execution of the trie is:
3725 * At the beginning, run though all the states, and
3726 * find the longest-matching word. Also remember the position
3727 * of the shortest matching word. For example, this pattern:
3728 * 1 2 3 4 5
3729 * ab|a|x|abcd|abc
3730 * when matched against the string "abcde", will generate
3731 * accept states for all words except 3, with the longest
895cc420 3732 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
3733 * the position being after char 1 of the string).
3734 *
3735 * Then for each matching word, in word order (i.e. 1,2,4,5),
3736 * we run the remainder of the pattern; on each try setting
3737 * the current position to the character following the word,
3738 * returning to try the next word on failure.
3739 *
3740 * We avoid having to build a list of words at runtime by
3741 * using a compile-time structure, wordinfo[].prev, which
3742 * gives, for each word, the previous accepting word (if any).
3743 * In the case above it would contain the mappings 1->2, 2->0,
3744 * 3->0, 4->5, 5->1. We can use this table to generate, from
3745 * the longest word (4 above), a list of all words, by
3746 * following the list of prev pointers; this gives us the
3747 * unordered list 4,5,1,2. Then given the current word we have
3748 * just tried, we can go through the list and find the
3749 * next-biggest word to try (so if we just failed on word 2,
3750 * the next in the list is 4).
3751 *
3752 * Since at runtime we don't record the matching position in
3753 * the string for each word, we have to work that out for
3754 * each word we're about to process. The wordinfo table holds
3755 * the character length of each word; given that we recorded
3756 * at the start: the position of the shortest word and its
3757 * length in chars, we just need to move the pointer the
3758 * difference between the two char lengths. Depending on
3759 * Unicode status and folding, that's cheap or expensive.
3760 *
3761 * This algorithm is optimised for the case where are only a
3762 * small number of accept states, i.e. 0,1, or maybe 2.
3763 * With lots of accepts states, and having to try all of them,
3764 * it becomes quadratic on number of accept states to find all
3765 * the next words.
3766 */
3767
3dab1dad 3768 {
07be1b83 3769 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 3770 DECL_TRIE_TYPE(scan);
3dab1dad
YO
3771
3772 /* what trie are we using right now */
be8e71aa 3773 reg_trie_data * const trie
f8fc2ecf 3774 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 3775 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 3776 U32 state = trie->startstate;
166ba7cd 3777
7016d6eb
DM
3778 if ( trie->bitmap
3779 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3780 {
3dab1dad
YO
3781 if (trie->states[ state ].wordnum) {
3782 DEBUG_EXECUTE_r(
3783 PerlIO_printf(Perl_debug_log,
3784 "%*s %smatched empty string...%s\n",
5bc10b2c 3785 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 3786 );
20dbff7c
YO
3787 if (!trie->jump)
3788 break;
3dab1dad
YO
3789 } else {
3790 DEBUG_EXECUTE_r(
3791 PerlIO_printf(Perl_debug_log,
786e8c11 3792 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 3793 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
3794 );
3795 sayNO_SILENT;
3796 }
3797 }
166ba7cd 3798
786e8c11
YO
3799 {
3800 U8 *uc = ( U8* )locinput;
3801
3802 STRLEN len = 0;
3803 STRLEN foldlen = 0;
3804 U8 *uscan = (U8*)NULL;
786e8c11 3805 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
3806 U32 charcount = 0; /* how many input chars we have matched */
3807 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 3808
786e8c11 3809 ST.jump = trie->jump;
786e8c11 3810 ST.me = scan;
2e64971a
DM
3811 ST.firstpos = NULL;
3812 ST.longfold = FALSE; /* char longer if folded => it's harder */
3813 ST.nextword = 0;
3814
3815 /* fully traverse the TRIE; note the position of the
3816 shortest accept state and the wordnum of the longest
3817 accept state */
07be1b83 3818
a3621e74 3819 while ( state && uc <= (U8*)PL_regeol ) {
786e8c11 3820 U32 base = trie->states[ state ].trans.base;
f9f4320a 3821 UV uvc = 0;
acb909b4 3822 U16 charid = 0;
2e64971a
DM
3823 U16 wordnum;
3824 wordnum = trie->states[ state ].wordnum;
3825
3826 if (wordnum) { /* it's an accept state */
3827 if (!accepted) {
3828 accepted = 1;
3829 /* record first match position */
3830 if (ST.longfold) {
3831 ST.firstpos = (U8*)locinput;
3832 ST.firstchars = 0;
5b47454d 3833 }
2e64971a
DM
3834 else {
3835 ST.firstpos = uc;
3836 ST.firstchars = charcount;
3837 }
3838 }
3839 if (!ST.nextword || wordnum < ST.nextword)
3840 ST.nextword = wordnum;
3841 ST.topword = wordnum;
786e8c11 3842 }
a3621e74 3843
07be1b83 3844 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 3845 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 3846 PerlIO_printf( Perl_debug_log,
2e64971a 3847 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 3848 2+depth * 2, "", PL_colors[4],
2e64971a 3849 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 3850 });
a3621e74 3851
2e64971a 3852 /* read a char and goto next state */
7016d6eb 3853 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
6dd2be57 3854 I32 offset;
55eed653
NC
3855 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3856 uscan, len, uvc, charid, foldlen,
3857 foldbuf, uniflags);
2e64971a
DM
3858 charcount++;
3859 if (foldlen>0)
3860 ST.longfold = TRUE;
5b47454d 3861 if (charid &&
6dd2be57
DM
3862 ( ((offset =
3863 base + charid - 1 - trie->uniquecharcount)) >= 0)
3864
3865 && ((U32)offset < trie->lasttrans)
3866 && trie->trans[offset].check == state)
5b47454d 3867 {
6dd2be57 3868 state = trie->trans[offset].next;
5b47454d
DM
3869 }
3870 else {
3871 state = 0;
3872 }
3873 uc += len;
3874
3875 }
3876 else {
a3621e74
YO
3877 state = 0;
3878 }
3879 DEBUG_TRIE_EXECUTE_r(
e4584336 3880 PerlIO_printf( Perl_debug_log,
786e8c11 3881 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 3882 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
3883 );
3884 }
2e64971a 3885 if (!accepted)
a3621e74 3886 sayNO;
a3621e74 3887
2e64971a
DM
3888 /* calculate total number of accept states */
3889 {
3890 U16 w = ST.topword;
3891 accepted = 0;
3892 while (w) {
3893 w = trie->wordinfo[w].prev;
3894 accepted++;
3895 }
3896 ST.accepted = accepted;
3897 }
3898
166ba7cd
DM
3899 DEBUG_EXECUTE_r(
3900 PerlIO_printf( Perl_debug_log,
3901 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 3902 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
3903 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3904 );
2e64971a 3905 goto trie_first_try; /* jump into the fail handler */
786e8c11 3906 }}
118e2215 3907 assert(0); /* NOTREACHED */
2e64971a
DM
3908
3909 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
3910 {
3911 U8 *uc;
fae667d5
YO
3912 if ( ST.jump) {
3913 REGCP_UNWIND(ST.cp);
a8d1f4b4 3914 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 3915 }
2e64971a
DM
3916 if (!--ST.accepted) {
3917 DEBUG_EXECUTE_r({
3918 PerlIO_printf( Perl_debug_log,
3919 "%*s %sTRIE failed...%s\n",
3920 REPORT_CODE_OFF+depth*2, "",
3921 PL_colors[4],
3922 PL_colors[5] );
3923 });
3924 sayNO_SILENT;
3925 }
3926 {
3927 /* Find next-highest word to process. Note that this code
3928 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
3929 U16 min = 0;
3930 U16 word;
3931 U16 const nextword = ST.nextword;
3932 reg_trie_wordinfo * const wordinfo
2e64971a
DM
3933 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3934 for (word=ST.topword; word; word=wordinfo[word].prev) {
3935 if (word > nextword && (!min || word < min))
3936 min = word;
3937 }
3938 ST.nextword = min;
3939 }
3940
fae667d5 3941 trie_first_try:
5d458dd8
YO
3942 if (do_cutgroup) {
3943 do_cutgroup = 0;
3944 no_final = 0;
3945 }
fae667d5
YO
3946
3947 if ( ST.jump) {
b93070ed 3948 ST.lastparen = rex->lastparen;
f6033a9d 3949 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 3950 REGCP_SET(ST.cp);
2e64971a 3951 }
a3621e74 3952
2e64971a 3953 /* find start char of end of current word */
166ba7cd 3954 {
2e64971a 3955 U32 chars; /* how many chars to skip */
2e64971a
DM
3956 reg_trie_data * const trie
3957 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3958
3959 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3960 >= ST.firstchars);
3961 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3962 - ST.firstchars;
a059a757 3963 uc = ST.firstpos;
2e64971a
DM
3964
3965 if (ST.longfold) {
3966 /* the hard option - fold each char in turn and find
3967 * its folded length (which may be different */
3968 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3969 STRLEN foldlen;
3970 STRLEN len;
d9a396a3 3971 UV uvc;
2e64971a
DM
3972 U8 *uscan;
3973
3974 while (chars) {
f2ed9b32 3975 if (utf8_target) {
2e64971a
DM
3976 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3977 uniflags);
3978 uc += len;
3979 }
3980 else {
3981 uvc = *uc;
3982 uc++;
3983 }
3984 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3985 uscan = foldbuf;
3986 while (foldlen) {
3987 if (!--chars)
3988 break;
3989 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3990 uniflags);
3991 uscan += len;
3992 foldlen -= len;
3993 }
3994 }
a3621e74 3995 }
2e64971a 3996 else {
f2ed9b32 3997 if (utf8_target)
2e64971a
DM
3998 while (chars--)
3999 uc += UTF8SKIP(uc);
4000 else
4001 uc += chars;
4002 }
2e64971a 4003 }
166ba7cd 4004
6603fe3e
DM
4005 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4006 ? ST.jump[ST.nextword]
4007 : NEXT_OFF(ST.me));
166ba7cd 4008
2e64971a
DM
4009 DEBUG_EXECUTE_r({
4010 PerlIO_printf( Perl_debug_log,
4011 "%*s %sTRIE matched word #%d, continuing%s\n",
4012 REPORT_CODE_OFF+depth*2, "",
4013 PL_colors[4],
4014 ST.nextword,
4015 PL_colors[5]
4016 );
4017 });
4018
4019 if (ST.accepted > 1 || has_cutgroup) {
a059a757 4020 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
118e2215 4021 assert(0); /* NOTREACHED */
166ba7cd 4022 }
2e64971a
DM
4023 /* only one choice left - just continue */
4024 DEBUG_EXECUTE_r({
4025 AV *const trie_words
4026 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4027 SV ** const tmp = av_fetch( trie_words,
4028 ST.nextword-1, 0 );
4029 SV *sv= tmp ? sv_newmortal() : NULL;
4030
4031 PerlIO_printf( Perl_debug_log,
4032 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4033 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4034 ST.nextword,
4035 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4036 PL_colors[0], PL_colors[1],
c89df6cf 4037 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
4038 )
4039 : "not compiled under -Dr",
4040 PL_colors[5] );
4041 });
4042
a059a757 4043 locinput = (char*)uc;
2e64971a 4044 continue; /* execute rest of RE */
118e2215 4045 assert(0); /* NOTREACHED */
a059a757 4046 }
166ba7cd
DM
4047#undef ST
4048
3c0563b9 4049 case EXACT: { /* /abc/ */
95b24440 4050 char *s = STRING(scan);
24d3c4a9 4051 ln = STR_LEN(scan);
984e6dd1 4052 if (utf8_target != is_utf8_pat) {
bc517b45 4053 /* The target and the pattern have differing utf8ness. */
1aa99e6b 4054 char *l = locinput;
24d3c4a9 4055 const char * const e = s + ln;
a72c7584 4056
f2ed9b32 4057 if (utf8_target) {
e6a3850e
KW
4058 /* The target is utf8, the pattern is not utf8.
4059 * Above-Latin1 code points can't match the pattern;
4060 * invariants match exactly, and the other Latin1 ones need
4061 * to be downgraded to a single byte in order to do the
4062 * comparison. (If we could be confident that the target
4063 * is not malformed, this could be refactored to have fewer
4064 * tests by just assuming that if the first bytes match, it
4065 * is an invariant, but there are tests in the test suite
4066 * dealing with (??{...}) which violate this) */
1aa99e6b 4067 while (s < e) {
5ac65bff 4068 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
e6a3850e
KW
4069 sayNO;
4070 }
4071 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4072 if (*l != *s) {
4073 sayNO;
4074 }
4075 l++;
4076 }
4077 else {
4078 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4079 sayNO;
4080 }
4081 l += 2;
4082 }
4083 s++;
1aa99e6b 4084 }
5ff6fc6d
JH
4085 }
4086 else {
4087 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 4088 while (s < e) {
e6a3850e
KW
4089 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4090 {
4091 sayNO;
4092 }
4093 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4094 if (*s != *l) {
4095 sayNO;
4096 }
4097 s++;
4098 }
4099 else {
4100 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4101 sayNO;
4102 }
4103 s += 2;
4104 }
4105 l++;
1aa99e6b 4106 }
5ff6fc6d 4107 }
1aa99e6b 4108 locinput = l;
1aa99e6b 4109 }
5ac65bff
KW
4110 else {
4111 /* The target and the pattern have the same utf8ness. */
4112 /* Inline the first character, for speed. */
4113 if (PL_regeol - locinput < ln
4114 || UCHARAT(s) != nextchr
4115 || (ln > 1 && memNE(s, locinput, ln)))
4116 {
4117 sayNO;
4118 }
4119 locinput += ln;
4120 }
d6a28714 4121 break;
95b24440 4122 }
7016d6eb 4123
3c0563b9 4124 case EXACTFL: { /* /abc/il */
a932d541 4125 re_fold_t folder;
9a5a5549
KW
4126 const U8 * fold_array;
4127 const char * s;
d513472c 4128 U32 fold_utf8_flags;
9a5a5549 4129
272d35c9 4130 RX_MATCH_TAINTED_on(reginfo->prog);
f67f9e53
KW
4131 folder = foldEQ_locale;
4132 fold_array = PL_fold_locale;
17580e7a 4133 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
9a5a5549
KW
4134 goto do_exactf;
4135
3c0563b9
DM
4136 case EXACTFU_SS: /* /\x{df}/iu */
4137 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4138 case EXACTFU: /* /abc/iu */
9a5a5549
KW
4139 folder = foldEQ_latin1;
4140 fold_array = PL_fold_latin1;
984e6dd1 4141 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
4142 goto do_exactf;
4143
3c0563b9 4144 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
4145 folder = foldEQ_latin1;
4146 fold_array = PL_fold_latin1;
57014d77 4147 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
4148 goto do_exactf;
4149
3c0563b9 4150 case EXACTF: /* /abc/i */
9a5a5549
KW
4151 folder = foldEQ;
4152 fold_array = PL_fold;
62bf7766 4153 fold_utf8_flags = 0;
9a5a5549
KW
4154
4155 do_exactf:
4156 s = STRING(scan);
24d3c4a9 4157 ln = STR_LEN(scan);
d6a28714 4158
984e6dd1 4159 if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
3c760661
KW
4160 /* Either target or the pattern are utf8, or has the issue where
4161 * the fold lengths may differ. */
be8e71aa 4162 const char * const l = locinput;
d07ddd77 4163 char *e = PL_regeol;
bc517b45 4164
984e6dd1 4165 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
fa5b1667 4166 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
4167 {
4168 sayNO;
5486206c 4169 }
d07ddd77 4170 locinput = e;
d07ddd77 4171 break;
a0ed51b3 4172 }
d6a28714 4173
0a138b74 4174 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
4175 if (UCHARAT(s) != nextchr
4176 && !NEXTCHR_IS_EOS
4177 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 4178 {
a0ed51b3 4179 sayNO;
9a5a5549 4180 }
24d3c4a9 4181 if (PL_regeol - locinput < ln)
b8c5462f 4182 sayNO;
9a5a5549 4183 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 4184 sayNO;
24d3c4a9 4185 locinput += ln;
a0d0e21e 4186 break;
9a5a5549 4187 }
63ac0dad
KW
4188
4189 /* XXX Could improve efficiency by separating these all out using a
4190 * macro or in-line function. At that point regcomp.c would no longer
4191 * have to set the FLAGS fields of these */
3c0563b9
DM
4192 case BOUNDL: /* /\b/l */
4193 case NBOUNDL: /* /\B/l */
272d35c9 4194 RX_MATCH_TAINTED_on(reginfo->prog);
b2680017 4195 /* FALL THROUGH */
3c0563b9
DM
4196 case BOUND: /* /\b/ */
4197 case BOUNDU: /* /\b/u */
4198 case BOUNDA: /* /\b/a */
4199 case NBOUND: /* /\B/ */
4200 case NBOUNDU: /* /\B/u */
4201 case NBOUNDA: /* /\B/a */
b2680017 4202 /* was last char in word? */
f2e96b5d
KW
4203 if (utf8_target
4204 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4205 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4206 {
b2680017
YO
4207 if (locinput == PL_bostr)
4208 ln = '\n';
4209 else {
4210 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4211
4212 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4213 }
63ac0dad 4214 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
0eb30aeb 4215 ln = isWORDCHAR_uni(ln);
7016d6eb
DM
4216 if (NEXTCHR_IS_EOS)
4217 n = 0;
4218 else {
4219 LOAD_UTF8_CHARCLASS_ALNUM();
03940dc2 4220 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
7016d6eb
DM
4221 utf8_target);
4222 }
b2680017
YO
4223 }
4224 else {
0eb30aeb
KW
4225 ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4226 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
b2680017
YO
4227 }
4228 }
4229 else {
cfaf538b
KW
4230
4231 /* Here the string isn't utf8, or is utf8 and only ascii
4232 * characters are to match \w. In the latter case looking at
4233 * the byte just prior to the current one may be just the final
4234 * byte of a multi-byte character. This is ok. There are two
4235 * cases:
4236 * 1) it is a single byte character, and then the test is doing
4237 * just what it's supposed to.
4238 * 2) it is a multi-byte character, in which case the final
4239 * byte is never mistakable for ASCII, and so the test
4240 * will say it is not a word character, which is the
4241 * correct answer. */
b2680017
YO
4242 ln = (locinput != PL_bostr) ?
4243 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
4244 switch (FLAGS(scan)) {
4245 case REGEX_UNICODE_CHARSET:
4246 ln = isWORDCHAR_L1(ln);
7016d6eb 4247 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
63ac0dad
KW
4248 break;
4249 case REGEX_LOCALE_CHARSET:
0eb30aeb
KW
4250 ln = isWORDCHAR_LC(ln);
4251 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
63ac0dad
KW
4252 break;
4253 case REGEX_DEPENDS_CHARSET:
0eb30aeb
KW
4254 ln = isWORDCHAR(ln);
4255 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
63ac0dad 4256 break;
cfaf538b 4257 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 4258 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b 4259 ln = isWORDCHAR_A(ln);
7016d6eb 4260 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
cfaf538b 4261 break;
63ac0dad
KW
4262 default:
4263 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4264 break;
b2680017
YO
4265 }
4266 }
63ac0dad
KW
4267 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4268 * regcomp.sym */
4269 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
4270 sayNO;
4271 break;
3c0563b9 4272
3c0563b9 4273 case ANYOF: /* /[abc]/ */
954a2af6 4274 case ANYOF_WARN_SUPER:
7016d6eb
DM
4275 if (NEXTCHR_IS_EOS)
4276 sayNO;
e0193e47 4277 if (utf8_target) {
635cd5d4 4278 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
09b08e9b 4279 sayNO;
635cd5d4 4280 locinput += UTF8SKIP(locinput);
ffc61ed2
JH
4281 }
4282 else {
20ed0b26 4283 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 4284 sayNO;
3640db6b 4285 locinput++;
e0f9d4a8 4286 }
b8c5462f 4287 break;
3c0563b9 4288
3018b823
KW
4289 /* The argument (FLAGS) to all the POSIX node types is the class number
4290 * */
ee9a90b8 4291
3018b823
KW
4292 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4293 to_complement = 1;
4294 /* FALLTHROUGH */
4295
4296 case POSIXL: /* \w or [:punct:] etc. under /l */
4297 if (NEXTCHR_IS_EOS)
bedac28b 4298 sayNO;
bedac28b 4299
3018b823
KW
4300 /* The locale hasn't influenced the outcome before this, so defer
4301 * tainting until now */
272d35c9 4302 RX_MATCH_TAINTED_on(reginfo->prog);
3018b823
KW
4303
4304 /* Use isFOO_lc() for characters within Latin1. (Note that
4305 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4306 * wouldn't be invariant) */
4307 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
eb4e9c04 4308 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
bedac28b
KW
4309 sayNO;
4310 }
4311 }
3018b823
KW
4312 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4313 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
eb4e9c04 4314 (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
3018b823
KW
4315 *(locinput + 1))))))
4316 {
bedac28b 4317 sayNO;
3018b823 4318 }
bedac28b 4319 }
3018b823
KW
4320 else { /* Here, must be an above Latin-1 code point */
4321 goto utf8_posix_not_eos;
bedac28b 4322 }
3018b823
KW
4323
4324 /* Here, must be utf8 */
4325 locinput += UTF8SKIP(locinput);
bedac28b
KW
4326 break;
4327
3018b823
KW
4328 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4329 to_complement = 1;
4330 /* FALLTHROUGH */
4331
4332 case POSIXD: /* \w or [:punct:] etc. under /d */
bedac28b 4333 if (utf8_target) {
3018b823 4334 goto utf8_posix;
bedac28b 4335 }
3018b823
KW
4336 goto posixa;
4337
4338 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
bedac28b 4339
3018b823 4340 if (NEXTCHR_IS_EOS) {
bedac28b
KW
4341 sayNO;
4342 }
bedac28b 4343
3018b823
KW
4344 /* All UTF-8 variants match */
4345 if (! UTF8_IS_INVARIANT(nextchr)) {
4346 goto increment_locinput;
bedac28b 4347 }
ee9a90b8 4348
3018b823
KW
4349 to_complement = 1;
4350 /* FALLTHROUGH */
4351
4352 case POSIXA: /* \w or [:punct:] etc. under /a */
4353
4354 posixa:
4355 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4356 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4357 * character is a single byte */
20d0b1e9 4358
3018b823
KW
4359 if (NEXTCHR_IS_EOS
4360 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4361 FLAGS(scan)))))
4362 {
0658cdde
KW
4363 sayNO;
4364 }
3018b823
KW
4365
4366 /* Here we are either not in utf8, or we matched a utf8-invariant,
4367 * so the next char is the next byte */
3640db6b 4368 locinput++;
0658cdde 4369 break;
3c0563b9 4370
3018b823
KW
4371 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4372 to_complement = 1;
4373 /* FALLTHROUGH */
4374
4375 case POSIXU: /* \w or [:punct:] etc. under /u */
4376 utf8_posix:
4377 if (NEXTCHR_IS_EOS) {
0658cdde
KW
4378 sayNO;
4379 }
3018b823
KW
4380 utf8_posix_not_eos:
4381
4382 /* Use _generic_isCC() for characters within Latin1. (Note that
4383 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4384 * wouldn't be invariant) */
4385 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4386 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4387 FLAGS(scan)))))
4388 {
4389 sayNO;
4390 }
4391 locinput++;
4392 }
4393 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4394 if (! (to_complement
4395 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4396 *(locinput + 1)),
4397 FLAGS(scan)))))
4398 {
4399 sayNO;
4400 }
4401 locinput += 2;
4402 }
4403 else { /* Handle above Latin-1 code points */
4404 classnum = (_char_class_number) FLAGS(scan);
4405 if (classnum < _FIRST_NON_SWASH_CC) {
4406
4407 /* Here, uses a swash to find such code points. Load if if
4408 * not done already */
4409 if (! PL_utf8_swash_ptrs[classnum]) {
4410 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4411 PL_utf8_swash_ptrs[classnum]
4412 = _core_swash_init("utf8",
4413 swash_property_names[classnum],
4414 &PL_sv_undef, 1, 0, NULL, &flags);
4415 }
4416 if (! (to_complement
4417 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4418 (U8 *) locinput, TRUE))))
4419 {
4420 sayNO;
4421 }
4422 }
4423 else { /* Here, uses macros to find above Latin-1 code points */
4424 switch (classnum) {
4425 case _CC_ENUM_SPACE: /* XXX would require separate
4426 code if we revert the change
4427 of \v matching this */
4428 case _CC_ENUM_PSXSPC:
4429 if (! (to_complement
4430 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4431 {
4432 sayNO;
4433 }
4434 break;
4435 case _CC_ENUM_BLANK:
4436 if (! (to_complement
4437 ^ cBOOL(is_HORIZWS_high(locinput))))
4438 {
4439 sayNO;
4440 }
4441 break;
4442 case _CC_ENUM_XDIGIT:
4443 if (! (to_complement
4444 ^ cBOOL(is_XDIGIT_high(locinput))))
4445 {
4446 sayNO;
4447 }
4448 break;
4449 case _CC_ENUM_VERTSPACE:
4450 if (! (to_complement
4451 ^ cBOOL(is_VERTWS_high(locinput))))
4452 {
4453 sayNO;
4454 }
4455 break;
4456 default: /* The rest, e.g. [:cntrl:], can't match
4457 above Latin1 */
4458 if (! to_complement) {
4459 sayNO;
4460 }
4461 break;
4462 }
4463 }
4464 locinput += UTF8SKIP(locinput);
4465 }
4466 break;
0658cdde 4467
37e2e78e
KW
4468 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4469 a Unicode extended Grapheme Cluster */
4470 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4471 extended Grapheme Cluster is:
4472
7aee35ff
KW
4473 CR LF
4474 | Prepend* Begin Extend*
4475 | .
37e2e78e 4476
7aee35ff
KW
4477 Begin is: ( Special_Begin | ! Control )
4478 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4479 Extend is: ( Grapheme_Extend | Spacing_Mark )
4480 Control is: [ GCB_Control | CR | LF ]
4481 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4482
27d4fc33
KW
4483 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4484 we can rewrite
4485
4486 Begin is ( Regular_Begin + Special Begin )
4487
4488 It turns out that 98.4% of all Unicode code points match
4489 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4490 the previous implementation for almost all Unicode code points.
27d4fc33 4491
37e2e78e
KW
4492 There is a subtlety with Prepend* which showed up in testing.
4493 Note that the Begin, and only the Begin is required in:
4494 | Prepend* Begin Extend*
cc3b396d
KW
4495 Also, Begin contains '! Control'. A Prepend must be a
4496 '! Control', which means it must also be a Begin. What it
4497 comes down to is that if we match Prepend* and then find no
4498 suitable Begin afterwards, that if we backtrack the last
4499 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4500 */
4501
7016d6eb 4502 if (NEXTCHR_IS_EOS)
a0ed51b3 4503 sayNO;
f2ed9b32 4504 if (! utf8_target) {
37e2e78e
KW
4505
4506 /* Match either CR LF or '.', as all the other possibilities
4507 * require utf8 */
4508 locinput++; /* Match the . or CR */
cc3b396d
KW
4509 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4510 match the LF */
37e2e78e 4511 && locinput < PL_regeol
e699a1d5
KW
4512 && UCHARAT(locinput) == '\n')
4513 {
4514 locinput++;
4515 }
37e2e78e
KW
4516 }
4517 else {
4518
4519 /* Utf8: See if is ( CR LF ); already know that locinput <
4520 * PL_regeol, so locinput+1 is in bounds */
7016d6eb 4521 if ( nextchr == '\r' && locinput+1 < PL_regeol
f67f9e53 4522 && UCHARAT(locinput + 1) == '\n')
7016d6eb 4523 {
37e2e78e
KW
4524 locinput += 2;
4525 }
4526 else {
45fdf108
KW
4527 STRLEN len;
4528
37e2e78e
KW
4529 /* In case have to backtrack to beginning, then match '.' */
4530 char *starting = locinput;
4531
4532 /* In case have to backtrack the last prepend */
e699a1d5 4533 char *previous_prepend = NULL;
37e2e78e
KW
4534
4535 LOAD_UTF8_CHARCLASS_GCB();
4536
45fdf108
KW
4537 /* Match (prepend)* */
4538 while (locinput < PL_regeol
4539 && (len = is_GCB_Prepend_utf8(locinput)))
4540 {
4541 previous_prepend = locinput;
4542 locinput += len;
a1853d78 4543 }
37e2e78e
KW
4544
4545 /* As noted above, if we matched a prepend character, but
4546 * the next thing won't match, back off the last prepend we
4547 * matched, as it is guaranteed to match the begin */
4548 if (previous_prepend
4549 && (locinput >= PL_regeol
c101f46d
KW
4550 || (! swash_fetch(PL_utf8_X_regular_begin,
4551 (U8*)locinput, utf8_target)
bff53399 4552 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
c101f46d 4553 )
37e2e78e
KW
4554 {
4555 locinput = previous_prepend;
4556 }
4557
4558 /* Note that here we know PL_regeol > locinput, as we
4559 * tested that upon input to this switch case, and if we
4560 * moved locinput forward, we tested the result just above
4561 * and it either passed, or we backed off so that it will
4562 * now pass */
11dfcd49
KW
4563 if (swash_fetch(PL_utf8_X_regular_begin,
4564 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4565 locinput += UTF8SKIP(locinput);
4566 }
bff53399 4567 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
37e2e78e
KW
4568
4569 /* Here did not match the required 'Begin' in the
4570 * second term. So just match the very first
4571 * character, the '.' of the final term of the regex */
4572 locinput = starting + UTF8SKIP(starting);
27d4fc33 4573 goto exit_utf8;
37e2e78e
KW
4574 } else {
4575
11dfcd49
KW
4576 /* Here is a special begin. It can be composed of
4577 * several individual characters. One possibility is
4578 * RI+ */
45fdf108
KW
4579 if ((len = is_GCB_RI_utf8(locinput))) {
4580 locinput += len;
11dfcd49 4581 while (locinput < PL_regeol
45fdf108 4582 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4583 {
45fdf108 4584 locinput += len;
11dfcd49 4585 }
45fdf108
KW
4586 } else if ((len = is_GCB_T_utf8(locinput))) {
4587 /* Another possibility is T+ */
4588 locinput += len;
11dfcd49 4589 while (locinput < PL_regeol
45fdf108 4590 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4591 {
45fdf108 4592 locinput += len;
11dfcd49
KW
4593 }
4594 } else {
4595
4596 /* Here, neither RI+ nor T+; must be some other
4597 * Hangul. That means it is one of the others: L,
4598 * LV, LVT or V, and matches:
4599 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4600
4601 /* Match L* */
4602 while (locinput < PL_regeol
45fdf108 4603 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4604 {
45fdf108 4605 locinput += len;
11dfcd49 4606 }
37e2e78e 4607
11dfcd49
KW
4608 /* Here, have exhausted L*. If the next character
4609 * is not an LV, LVT nor V, it means we had to have
4610 * at least one L, so matches L+ in the original
4611 * equation, we have a complete hangul syllable.
4612 * Are done. */
4613
4614 if (locinput < PL_regeol
45fdf108 4615 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49 4616 {
11dfcd49 4617 /* Otherwise keep going. Must be LV, LVT or V.
7d43c479
KW
4618 * See if LVT, by first ruling out V, then LV */
4619 if (! is_GCB_V_utf8(locinput)
4620 /* All but every TCount one is LV */
4621 && (valid_utf8_to_uvchr((U8 *) locinput,
4622 NULL)
4623 - SBASE)
4624 % TCount != 0)
4625 {
11dfcd49
KW
4626 locinput += UTF8SKIP(locinput);
4627 } else {
4628
4629 /* Must be V or LV. Take it, then match
4630 * V* */
4631 locinput += UTF8SKIP(locinput);
4632 while (locinput < PL_regeol
45fdf108 4633 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 4634 {
45fdf108 4635 locinput += len;
11dfcd49
KW
4636 }
4637 }
37e2e78e 4638
11dfcd49 4639 /* And any of LV, LVT, or V can be followed
45fdf108 4640 * by T* */
11dfcd49 4641 while (locinput < PL_regeol
45fdf108 4642 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4643 {
45fdf108 4644 locinput += len;
11dfcd49
KW
4645 }
4646 }
cd94d768 4647 }
11dfcd49 4648 }
37e2e78e 4649
11dfcd49
KW
4650 /* Match any extender */
4651 while (locinput < PL_regeol
4652 && swash_fetch(PL_utf8_X_extend,
4653 (U8*)locinput, utf8_target))
4654 {
4655 locinput += UTF8SKIP(locinput);
4656 }
37e2e78e 4657 }
27d4fc33 4658 exit_utf8:
37e2e78e
KW
4659 if (locinput > PL_regeol) sayNO;
4660 }
a0ed51b3 4661 break;
81714fb9 4662
3c0563b9 4663 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
4664 { /* The capture buffer cases. The ones beginning with N for the
4665 named buffers just convert to the equivalent numbered and
4666 pretend they were called as the corresponding numbered buffer
4667 op. */
26ecd678
TC
4668 /* don't initialize these in the declaration, it makes C++
4669 unhappy */
81714fb9 4670 char *s;
ff1157ca 4671 char type;
8368298a
TC
4672 re_fold_t folder;
4673 const U8 *fold_array;
26ecd678 4674 UV utf8_fold_flags;
8368298a 4675
272d35c9 4676 RX_MATCH_TAINTED_on(reginfo->prog);
d7ef4b73
KW
4677 folder = foldEQ_locale;
4678 fold_array = PL_fold_locale;
4679 type = REFFL;
17580e7a 4680 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4681 goto do_nref;
4682
3c0563b9 4683 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
4684 folder = foldEQ_latin1;
4685 fold_array = PL_fold_latin1;
4686 type = REFFA;
4687 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4688 goto do_nref;
4689
3c0563b9 4690 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
4691 folder = foldEQ_latin1;
4692 fold_array = PL_fold_latin1;
4693 type = REFFU;
d513472c 4694 utf8_fold_flags = 0;
d7ef4b73
KW
4695 goto do_nref;
4696
3c0563b9 4697 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
4698 folder = foldEQ;
4699 fold_array = PL_fold;
4700 type = REFF;
d513472c 4701 utf8_fold_flags = 0;
d7ef4b73
KW
4702 goto do_nref;
4703
3c0563b9 4704 case NREF: /* /\g{name}/ */
d7ef4b73 4705 type = REF;
83d7b90b
KW
4706 folder = NULL;
4707 fold_array = NULL;
d513472c 4708 utf8_fold_flags = 0;
d7ef4b73
KW
4709 do_nref:
4710
4711 /* For the named back references, find the corresponding buffer
4712 * number */
0a4db386
YO
4713 n = reg_check_named_buff_matched(rex,scan);
4714
d7ef4b73 4715 if ( ! n ) {
81714fb9 4716 sayNO;
d7ef4b73
KW
4717 }
4718 goto do_nref_ref_common;
4719
3c0563b9 4720 case REFFL: /* /\1/il */
272d35c9 4721 RX_MATCH_TAINTED_on(reginfo->prog);
d7ef4b73
KW
4722 folder = foldEQ_locale;
4723 fold_array = PL_fold_locale;
17580e7a 4724 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
d7ef4b73
KW
4725 goto do_ref;
4726
3c0563b9 4727 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
4728 folder = foldEQ_latin1;
4729 fold_array = PL_fold_latin1;
4730 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4731 goto do_ref;
4732
3c0563b9 4733 case REFFU: /* /\1/iu */
d7ef4b73
KW
4734 folder = foldEQ_latin1;
4735 fold_array = PL_fold_latin1;
d513472c 4736 utf8_fold_flags = 0;
d7ef4b73
KW
4737 goto do_ref;
4738
3c0563b9 4739 case REFF: /* /\1/i */
d7ef4b73
KW
4740 folder = foldEQ;
4741 fold_array = PL_fold;
d513472c 4742 utf8_fold_flags = 0;
83d7b90b 4743 goto do_ref;
d7ef4b73 4744
3c0563b9 4745 case REF: /* /\1/ */
83d7b90b
KW
4746 folder = NULL;
4747 fold_array = NULL;
d513472c 4748 utf8_fold_flags = 0;
83d7b90b 4749
d7ef4b73 4750 do_ref:
81714fb9 4751 type = OP(scan);
d7ef4b73
KW
4752 n = ARG(scan); /* which paren pair */
4753
4754 do_nref_ref_common:
b93070ed 4755 ln = rex->offs[n].start;
2c2d71f5 4756 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
b93070ed 4757 if (rex->lastparen < n || ln == -1)
af3f8c16 4758 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 4759 if (ln == rex->offs[n].end)
a0d0e21e 4760 break;
a0ed51b3 4761
24d3c4a9 4762 s = PL_bostr + ln;
d7ef4b73 4763 if (type != REF /* REF can do byte comparison */
2f65c56d 4764 && (utf8_target || type == REFFU))
d7ef4b73
KW
4765 { /* XXX handle REFFL better */
4766 char * limit = PL_regeol;
4767
4768 /* This call case insensitively compares the entire buffer
4769 * at s, with the current input starting at locinput, but
4770 * not going off the end given by PL_regeol, and returns in
2db97d41 4771 * <limit> upon success, how much of the current input was
d7ef4b73 4772 * matched */
b93070ed 4773 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 4774 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
4775 {
4776 sayNO;
a0ed51b3 4777 }
d7ef4b73 4778 locinput = limit;
a0ed51b3
LW
4779 break;
4780 }
4781
d7ef4b73 4782 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
4783 if (!NEXTCHR_IS_EOS &&
4784 UCHARAT(s) != nextchr &&
81714fb9 4785 (type == REF ||
d7ef4b73 4786 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 4787 sayNO;
b93070ed 4788 ln = rex->offs[n].end - ln;
24d3c4a9 4789 if (locinput + ln > PL_regeol)
4633a7c4 4790 sayNO;
81714fb9 4791 if (ln > 1 && (type == REF
24d3c4a9 4792 ? memNE(s, locinput, ln)
d7ef4b73 4793 : ! folder(s, locinput, ln)))
4633a7c4 4794 sayNO;
24d3c4a9 4795 locinput += ln;
a0d0e21e 4796 break;
81714fb9 4797 }
3c0563b9
DM
4798
4799 case NOTHING: /* null op; e.g. the 'nothing' following
4800 * the '*' in m{(a+|b)*}' */
4801 break;
4802 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 4803 break;
3c0563b9
DM
4804
4805 case BACK: /* ??? doesn't appear to be used ??? */
a0d0e21e 4806 break;
40a82448
DM
4807
4808#undef ST
4809#define ST st->u.eval
c277df42 4810 {
c277df42 4811 SV *ret;
d2f13c59 4812 REGEXP *re_sv;
6bda09f9 4813 regexp *re;
f8fc2ecf 4814 regexp_internal *rei;
1a147d38
YO
4815 regnode *startpoint;
4816
3c0563b9 4817 case GOSTART: /* (?R) */
e7707071
YO
4818 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4819 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 4820 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 4821 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 4822 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
4823 Perl_croak(aTHX_
4824 "Pattern subroutine nesting without pos change"
4825 " exceeded limit in regex");
6bda09f9
YO
4826 } else {
4827 nochange_depth = 0;
1a147d38 4828 }
288b8c02 4829 re_sv = rex_sv;
6bda09f9 4830 re = rex;
f8fc2ecf 4831 rei = rexi;
1a147d38 4832 if (OP(scan)==GOSUB) {
6bda09f9
YO
4833 startpoint = scan + ARG2L(scan);
4834 ST.close_paren = ARG(scan);
4835 } else {
f8fc2ecf 4836 startpoint = rei->program+1;
6bda09f9
YO
4837 ST.close_paren = 0;
4838 }
4839 goto eval_recurse_doit;
118e2215 4840 assert(0); /* NOTREACHED */
3c0563b9 4841
6bda09f9
YO
4842 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4843 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 4844 if ( ++nochange_depth > max_nochange_depth )
1a147d38 4845 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
4846 } else {
4847 nochange_depth = 0;
4848 }
8e5e9ebe 4849 {
4aabdb9b 4850 /* execute the code in the {...} */
81ed78b2 4851
4aabdb9b 4852 dSP;
a6dc34f1 4853 IV before;
1f4d1a1e 4854 OP * const oop = PL_op;
4aabdb9b 4855 COP * const ocurcop = PL_curcop;
81ed78b2 4856 OP *nop;
d80618d2 4857 char *saved_regeol = PL_regeol;
91332126 4858 struct re_save_state saved_state;
81ed78b2 4859 CV *newcv;
91332126 4860
74088413 4861 /* save *all* paren positions */
92da3157 4862 regcppush(rex, 0, maxopenparen);
74088413
DM
4863 REGCP_SET(runops_cp);
4864
6562f1c4 4865 /* To not corrupt the existing regex state while executing the
b7f4cd04
FR
4866 * eval we would normally put it on the save stack, like with
4867 * save_re_context. However, re-evals have a weird scoping so we
4868 * can't just add ENTER/LEAVE here. With that, things like
4869 *
4870 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4871 *
4872 * would break, as they expect the localisation to be unwound
4873 * only when the re-engine backtracks through the bit that
4874 * localised it.
4875 *
4876 * What we do instead is just saving the state in a local c
4877 * variable.
4878 */
91332126 4879 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
81ed78b2 4880
81ed78b2
DM
4881 if (!caller_cv)
4882 caller_cv = find_runcv(NULL);
4883
4aabdb9b 4884 n = ARG(scan);
81ed78b2 4885
b30fcab9 4886 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8d919b0a 4887 newcv = (ReANY(
b30fcab9
DM
4888 (REGEXP*)(rexi->data->data[n])
4889 ))->qr_anoncv
81ed78b2
DM
4890 ;
4891 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
4892 }
4893 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
4894 newcv = caller_cv;
4895 nop = (OP*)rexi->data->data[n];
4896 assert(CvDEPTH(newcv));
68e2671b
DM
4897 }
4898 else {
d24ca0c5
DM
4899 /* literal with own CV */
4900 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
4901 newcv = rex->qr_anoncv;
4902 nop = (OP*)rexi->data->data[n];
68e2671b 4903 }
81ed78b2 4904
0e458318
DM
4905 /* normally if we're about to execute code from the same
4906 * CV that we used previously, we just use the existing
4907 * CX stack entry. However, its possible that in the
4908 * meantime we may have backtracked, popped from the save
4909 * stack, and undone the SAVECOMPPAD(s) associated with
4910 * PUSH_MULTICALL; in which case PL_comppad no longer
4911 * points to newcv's pad. */
4912 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4913 {
b0065247
DM
4914 U8 flags = (CXp_SUB_RE |
4915 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
0e458318 4916 if (last_pushed_cv) {
b0065247 4917 CHANGE_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
4918 }
4919 else {
b0065247 4920 PUSH_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
4921 }
4922 last_pushed_cv = newcv;
4923 }
c31ee3bb
DM
4924 else {
4925 /* these assignments are just to silence compiler
4926 * warnings */
4927 multicall_cop = NULL;
4928 newsp = NULL;
4929 }
0e458318
DM
4930 last_pad = PL_comppad;
4931
2e2e3f36
DM
4932 /* the initial nextstate you would normally execute
4933 * at the start of an eval (which would cause error
4934 * messages to come from the eval), may be optimised
4935 * away from the execution path in the regex code blocks;
4936 * so manually set PL_curcop to it initially */
4937 {
81ed78b2 4938 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
4939 assert(o->op_type == OP_NULL);
4940 if (o->op_targ == OP_SCOPE) {
4941 o = cUNOPo->op_first;
4942 }
4943 else {
4944 assert(o->op_targ == OP_LEAVE);
4945 o = cUNOPo->op_first;
4946 assert(o->op_type == OP_ENTER);
4947 o = o->op_sibling;
4948 }
4949
4950 if (o->op_type != OP_STUB) {
4951 assert( o->op_type == OP_NEXTSTATE
4952 || o->op_type == OP_DBSTATE
4953 || (o->op_type == OP_NULL
4954 && ( o->op_targ == OP_NEXTSTATE
4955 || o->op_targ == OP_DBSTATE
4956 )
4957 )
4958 );
4959 PL_curcop = (COP*)o;
4960 }
4961 }
81ed78b2 4962 nop = nop->op_next;
2e2e3f36 4963
24b23f37 4964 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
4965 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4966
b93070ed 4967 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4aabdb9b 4968
2bf803e2
YO
4969 if (sv_yes_mark) {
4970 SV *sv_mrk = get_sv("REGMARK", 1);
4971 sv_setsv(sv_mrk, sv_yes_mark);
4972 }
4973
81ed78b2
DM
4974 /* we don't use MULTICALL here as we want to call the
4975 * first op of the block of interest, rather than the
4976 * first op of the sub */
a6dc34f1 4977 before = (IV)(SP-PL_stack_base);
81ed78b2 4978 PL_op = nop;
8e5e9ebe
RGS
4979 CALLRUNOPS(aTHX); /* Scalar context. */
4980 SPAGAIN;
a6dc34f1 4981 if ((IV)(SP-PL_stack_base) == before)
075aa684 4982 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
4983 else {
4984 ret = POPs;
4985 PUTBACK;
4986 }
4aabdb9b 4987
e4bfbed3
DM
4988 /* before restoring everything, evaluate the returned
4989 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
4990 * PL_op or pad. Also need to process any magic vars
4991 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
4992
4993 PL_op = NULL;
4994
5e98dac2 4995 re_sv = NULL;
e4bfbed3
DM
4996 if (logical == 0) /* (?{})/ */
4997 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4998 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4999 sw = cBOOL(SvTRUE(ret));
5000 logical = 0;
5001 }
5002 else { /* /(??{}) */
497d0a96
DM
5003 /* if its overloaded, let the regex compiler handle
5004 * it; otherwise extract regex, or stringify */
5005 if (!SvAMAGIC(ret)) {
5006 SV *sv = ret;
5007 if (SvROK(sv))
5008 sv = SvRV(sv);
5009 if (SvTYPE(sv) == SVt_REGEXP)
5010 re_sv = (REGEXP*) sv;
5011 else if (SvSMAGICAL(sv)) {
5012 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5013 if (mg)
5014 re_sv = (REGEXP *) mg->mg_obj;
5015 }
e4bfbed3 5016
497d0a96
DM
5017 /* force any magic, undef warnings here */
5018 if (!re_sv) {
5019 ret = sv_mortalcopy(ret);
5020 (void) SvPV_force_nolen(ret);
5021 }
e4bfbed3
DM
5022 }
5023
5024 }
5025
91332126
FR
5026 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5027
81ed78b2
DM
5028 /* *** Note that at this point we don't restore
5029 * PL_comppad, (or pop the CxSUB) on the assumption it may
5030 * be used again soon. This is safe as long as nothing
5031 * in the regexp code uses the pad ! */
4aabdb9b 5032 PL_op = oop;
4aabdb9b 5033 PL_curcop = ocurcop;
d80618d2 5034 PL_regeol = saved_regeol;
92da3157 5035 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
e4bfbed3
DM
5036
5037 if (logical != 2)
4aabdb9b 5038 break;
8e5e9ebe 5039 }
e4bfbed3
DM
5040
5041 /* only /(??{})/ from now on */
24d3c4a9 5042 logical = 0;
4aabdb9b 5043 {
4f639d21
DM
5044 /* extract RE object from returned value; compiling if
5045 * necessary */
5c35adbb 5046
575c37f6
DM
5047 if (re_sv) {
5048 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 5049 }
0f5d15d6 5050 else {
c737faaf 5051 U32 pm_flags = 0;
0f5d15d6 5052
9753d940
DM
5053 if (SvUTF8(ret) && IN_BYTES) {
5054 /* In use 'bytes': make a copy of the octet
5055 * sequence, but without the flag on */
b9ad30b4
NC
5056 STRLEN len;
5057 const char *const p = SvPV(ret, len);
5058 ret = newSVpvn_flags(p, len, SVs_TEMP);
5059 }
732caac7
DM
5060 if (rex->intflags & PREGf_USE_RE_EVAL)
5061 pm_flags |= PMf_USE_RE_EVAL;
5062
5063 /* if we got here, it should be an engine which
5064 * supports compiling code blocks and stuff */
5065 assert(rex->engine && rex->engine->op_comp);
ec841a27 5066 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 5067 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
5068 rex->engine, NULL, NULL,
5069 /* copy /msix etc to inner pattern */
5070 scan->flags,
5071 pm_flags);
732caac7 5072
9041c2e3 5073 if (!(SvFLAGS(ret)
faf82a0b 5074 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3ce3ed55 5075 | SVs_GMG))) {
a2794585
NC
5076 /* This isn't a first class regexp. Instead, it's
5077 caching a regexp onto an existing, Perl visible
5078 scalar. */
575c37f6 5079 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 5080 }
74088413
DM
5081 /* safe to do now that any $1 etc has been
5082 * interpolated into the new pattern string and
5083 * compiled */
92da3157 5084 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
0f5d15d6 5085 }
e1ff3a88 5086 SAVEFREESV(re_sv);
8d919b0a 5087 re = ReANY(re_sv);
4aabdb9b 5088 }
07bc277f 5089 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
5090 re->subbeg = rex->subbeg;
5091 re->sublen = rex->sublen;
6502e081
DM
5092 re->suboffset = rex->suboffset;
5093 re->subcoffset = rex->subcoffset;
f8fc2ecf 5094 rei = RXi_GET(re);
6bda09f9 5095 DEBUG_EXECUTE_r(
f2ed9b32 5096 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
6bda09f9
YO
5097 "Matching embedded");
5098 );
f8fc2ecf 5099 startpoint = rei->program + 1;
1a147d38 5100 ST.close_paren = 0; /* only used for GOSUB */
aa283a38 5101
1a147d38 5102 eval_recurse_doit: /* Share code with GOSUB below this line */
aa283a38 5103 /* run the pattern returned from (??{...}) */
92da3157
DM
5104
5105 /* Save *all* the positions. */
5106 ST.cp = regcppush(rex, 0, maxopenparen);
40a82448 5107 REGCP_SET(ST.lastcp);
6bda09f9 5108
0357f1fd
ML
5109 re->lastparen = 0;
5110 re->lastcloseparen = 0;
5111
92da3157 5112 maxopenparen = 0;
4aabdb9b
DM
5113
5114 /* XXXX This is too dramatic a measure... */
5115 PL_reg_maxiter = 0;
5116
984e6dd1
DM
5117 ST.saved_utf8_pat = is_utf8_pat;
5118 is_utf8_pat = cBOOL(RX_UTF8(re_sv));
faec1544 5119
288b8c02 5120 ST.prev_rex = rex_sv;
faec1544 5121 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
5122 rex_sv = re_sv;
5123 SET_reg_curpm(rex_sv);
288b8c02 5124 rex = re;
f8fc2ecf 5125 rexi = rei;
faec1544 5126 cur_curlyx = NULL;
40a82448 5127 ST.B = next;
faec1544
DM
5128 ST.prev_eval = cur_eval;
5129 cur_eval = st;
faec1544 5130 /* now continue from first node in postoned RE */
4d5016e5 5131 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
118e2215 5132 assert(0); /* NOTREACHED */
c277df42 5133 }
40a82448 5134
faec1544
DM
5135 case EVAL_AB: /* cleanup after a successful (??{A})B */
5136 /* note: this is called twice; first after popping B, then A */
984e6dd1 5137 is_utf8_pat = ST.saved_utf8_pat;
ec43f78b
DM
5138 rex_sv = ST.prev_rex;
5139 SET_reg_curpm(rex_sv);
8d919b0a 5140 rex = ReANY(rex_sv);
f8fc2ecf 5141 rexi = RXi_GET(rex);
faec1544
DM
5142 regcpblow(ST.cp);
5143 cur_eval = ST.prev_eval;
5144 cur_curlyx = ST.prev_curlyx;
34a81e2b 5145
40a82448
DM
5146 /* XXXX This is too dramatic a measure... */
5147 PL_reg_maxiter = 0;
e7707071 5148 if ( nochange_depth )
4b196cd4 5149 nochange_depth--;
262b90c4 5150 sayYES;
40a82448 5151
40a82448 5152
faec1544
DM
5153 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5154 /* note: this is called twice; first after popping B, then A */
984e6dd1 5155 is_utf8_pat = ST.saved_utf8_pat;
ec43f78b
DM
5156 rex_sv = ST.prev_rex;
5157 SET_reg_curpm(rex_sv);
8d919b0a 5158 rex = ReANY(rex_sv);
f8fc2ecf 5159 rexi = RXi_GET(rex);
0357f1fd 5160
40a82448 5161 REGCP_UNWIND(ST.lastcp);
92da3157 5162 regcppop(rex, &maxopenparen);
faec1544
DM
5163 cur_eval = ST.prev_eval;
5164 cur_curlyx = ST.prev_curlyx;
5165 /* XXXX This is too dramatic a measure... */
5166 PL_reg_maxiter = 0;
e7707071 5167 if ( nochange_depth )
4b196cd4 5168 nochange_depth--;
40a82448 5169 sayNO_SILENT;
40a82448
DM
5170#undef ST
5171
3c0563b9 5172 case OPEN: /* ( */
c277df42 5173 n = ARG(scan); /* which paren pair */
1ca2007e 5174 rex->offs[n].start_tmp = locinput - PL_bostr;
92da3157
DM
5175 if (n > maxopenparen)
5176 maxopenparen = n;
495f47a5 5177 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
92da3157 5178 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
495f47a5
DM
5179 PTR2UV(rex),
5180 PTR2UV(rex->offs),
5181 (UV)n,
5182 (IV)rex->offs[n].start_tmp,
92da3157 5183 (UV)maxopenparen
495f47a5 5184 ));
e2e6a0f1 5185 lastopen = n;
a0d0e21e 5186 break;
495f47a5
DM
5187
5188/* XXX really need to log other places start/end are set too */
5189#define CLOSE_CAPTURE \
5190 rex->offs[n].start = rex->offs[n].start_tmp; \
5191 rex->offs[n].end = locinput - PL_bostr; \
5192 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5193 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5194 PTR2UV(rex), \
5195 PTR2UV(rex->offs), \
5196 (UV)n, \
5197 (IV)rex->offs[n].start, \
5198 (IV)rex->offs[n].end \
5199 ))
5200
3c0563b9 5201 case CLOSE: /* ) */
c277df42 5202 n = ARG(scan); /* which paren pair */
495f47a5 5203 CLOSE_CAPTURE;
b93070ed
DM
5204 if (n > rex->lastparen)
5205 rex->lastparen = n;
5206 rex->lastcloseparen = n;
3b6647e0 5207 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
5208 goto fake_end;
5209 }
a0d0e21e 5210 break;
3c0563b9
DM
5211
5212 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
5213 if (ARG(scan)){
5214 regnode *cursor;
5215 for (cursor=scan;
5216 cursor && OP(cursor)!=END;
5217 cursor=regnext(cursor))
5218 {
5219 if ( OP(cursor)==CLOSE ){
5220 n = ARG(cursor);
5221 if ( n <= lastopen ) {
495f47a5 5222 CLOSE_CAPTURE;
b93070ed
DM
5223 if (n > rex->lastparen)
5224 rex->lastparen = n;
5225 rex->lastcloseparen = n;
3b6647e0
RB
5226 if ( n == ARG(scan) || (cur_eval &&
5227 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
5228 break;
5229 }
5230 }
5231 }
5232 }
5233 goto fake_end;
5234 /*NOTREACHED*/
3c0563b9
DM
5235
5236 case GROUPP: /* (?(1)) */
c277df42 5237 n = ARG(scan); /* which paren pair */
b93070ed 5238 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 5239 break;
3c0563b9
DM
5240
5241 case NGROUPP: /* (?(<name>)) */
0a4db386 5242 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 5243 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 5244 break;
3c0563b9
DM
5245
5246 case INSUBP: /* (?(R)) */
0a4db386 5247 n = ARG(scan);
3b6647e0 5248 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 5249 break;
3c0563b9
DM
5250
5251 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
5252 sw = 0;
5253 break;
3c0563b9
DM
5254
5255 case IFTHEN: /* (?(cond)A|B) */
2c2d71f5 5256 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
24d3c4a9 5257 if (sw)
c277df42
IZ
5258 next = NEXTOPER(NEXTOPER(scan));
5259 else {
5260 next = scan + ARG(scan);
5261 if (OP(next) == IFTHEN) /* Fake one. */
5262 next = NEXTOPER(NEXTOPER(next));
5263 }
5264 break;
3c0563b9
DM
5265
5266 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 5267 logical = scan->flags;
c277df42 5268 break;
c476f425 5269
2ab05381 5270/*******************************************************************
2ab05381 5271
c476f425
DM
5272The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5273pattern, where A and B are subpatterns. (For simple A, CURLYM or
5274STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 5275
c476f425 5276A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 5277
c476f425
DM
5278On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5279state, which contains the current count, initialised to -1. It also sets
5280cur_curlyx to point to this state, with any previous value saved in the
5281state block.
2ab05381 5282
c476f425
DM
5283CURLYX then jumps straight to the WHILEM op, rather than executing A,
5284since the pattern may possibly match zero times (i.e. it's a while {} loop
5285rather than a do {} while loop).
2ab05381 5286
c476f425
DM
5287Each entry to WHILEM represents a successful match of A. The count in the
5288CURLYX block is incremented, another WHILEM state is pushed, and execution
5289passes to A or B depending on greediness and the current count.
2ab05381 5290
c476f425
DM
5291For example, if matching against the string a1a2a3b (where the aN are
5292substrings that match /A/), then the match progresses as follows: (the
5293pushed states are interspersed with the bits of strings matched so far):
2ab05381 5294
c476f425
DM
5295 <CURLYX cnt=-1>
5296 <CURLYX cnt=0><WHILEM>
5297 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5298 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5299 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5300 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 5301
c476f425
DM
5302(Contrast this with something like CURLYM, which maintains only a single
5303backtrack state:
2ab05381 5304
c476f425
DM
5305 <CURLYM cnt=0> a1
5306 a1 <CURLYM cnt=1> a2
5307 a1 a2 <CURLYM cnt=2> a3
5308 a1 a2 a3 <CURLYM cnt=3> b
5309)
2ab05381 5310
c476f425
DM
5311Each WHILEM state block marks a point to backtrack to upon partial failure
5312of A or B, and also contains some minor state data related to that
5313iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5314overall state, such as the count, and pointers to the A and B ops.
2ab05381 5315
c476f425
DM
5316This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5317must always point to the *current* CURLYX block, the rules are:
2ab05381 5318
c476f425
DM
5319When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5320and set cur_curlyx to point the new block.
2ab05381 5321
c476f425
DM
5322When popping the CURLYX block after a successful or unsuccessful match,
5323restore the previous cur_curlyx.
2ab05381 5324
c476f425
DM
5325When WHILEM is about to execute B, save the current cur_curlyx, and set it
5326to the outer one saved in the CURLYX block.
2ab05381 5327
c476f425
DM
5328When popping the WHILEM block after a successful or unsuccessful B match,
5329restore the previous cur_curlyx.
2ab05381 5330
c476f425
DM
5331Here's an example for the pattern (AI* BI)*BO
5332I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 5333
c476f425
DM
5334cur_
5335curlyx backtrack stack
5336------ ---------------
5337NULL
5338CO <CO prev=NULL> <WO>
5339CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5340CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5341NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 5342
c476f425
DM
5343At this point the pattern succeeds, and we work back down the stack to
5344clean up, restoring as we go:
95b24440 5345
c476f425
DM
5346CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5347CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5348CO <CO prev=NULL> <WO>
5349NULL
a0374537 5350
c476f425
DM
5351*******************************************************************/
5352
5353#define ST st->u.curlyx
5354
5355 case CURLYX: /* start of /A*B/ (for complex A) */
5356 {
5357 /* No need to save/restore up to this paren */
5358 I32 parenfloor = scan->flags;
5359
5360 assert(next); /* keep Coverity happy */
5361 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5362 next += ARG(next);
5363
5364 /* XXXX Probably it is better to teach regpush to support
92da3157 5365 parenfloor > maxopenparen ... */
b93070ed
DM
5366 if (parenfloor > (I32)rex->lastparen)
5367 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
5368
5369 ST.prev_curlyx= cur_curlyx;
5370 cur_curlyx = st;
5371 ST.cp = PL_savestack_ix;
5372
5373 /* these fields contain the state of the current curly.
5374 * they are accessed by subsequent WHILEMs */
5375 ST.parenfloor = parenfloor;
d02d6d97 5376 ST.me = scan;
c476f425 5377 ST.B = next;
24d3c4a9
DM
5378 ST.minmod = minmod;
5379 minmod = 0;
c476f425
DM
5380 ST.count = -1; /* this will be updated by WHILEM */
5381 ST.lastloc = NULL; /* this will be updated by WHILEM */
5382
4d5016e5 5383 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
118e2215 5384 assert(0); /* NOTREACHED */
c476f425 5385 }
a0d0e21e 5386
c476f425 5387 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
5388 cur_curlyx = ST.prev_curlyx;
5389 sayYES;
118e2215 5390 assert(0); /* NOTREACHED */
a0d0e21e 5391
c476f425
DM
5392 case CURLYX_end_fail: /* just failed to match all of A*B */
5393 regcpblow(ST.cp);
5394 cur_curlyx = ST.prev_curlyx;
5395 sayNO;
118e2215 5396 assert(0); /* NOTREACHED */
4633a7c4 5397
a0d0e21e 5398
c476f425
DM
5399#undef ST
5400#define ST st->u.whilem
5401
5402 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5403 {
5404 /* see the discussion above about CURLYX/WHILEM */
c476f425 5405 I32 n;
d02d6d97
DM
5406 int min = ARG1(cur_curlyx->u.curlyx.me);
5407 int max = ARG2(cur_curlyx->u.curlyx.me);
5408 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5409
c476f425
DM
5410 assert(cur_curlyx); /* keep Coverity happy */
5411 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5412 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5413 ST.cache_offset = 0;
5414 ST.cache_mask = 0;
5415
c476f425
DM
5416
5417 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
5418 "%*s whilem: matched %ld out of %d..%d\n",
5419 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 5420 );
a0d0e21e 5421
c476f425 5422 /* First just match a string of min A's. */
a0d0e21e 5423
d02d6d97 5424 if (n < min) {
92da3157
DM
5425 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5426 maxopenparen);
c476f425 5427 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
5428 REGCP_SET(ST.lastcp);
5429
4d5016e5 5430 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
118e2215 5431 assert(0); /* NOTREACHED */
c476f425
DM
5432 }
5433
5434 /* If degenerate A matches "", assume A done. */
5435
5436 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5437 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5438 "%*s whilem: empty match detected, trying continuation...\n",
5439 REPORT_CODE_OFF+depth*2, "")
5440 );
5441 goto do_whilem_B_max;
5442 }
5443
5444 /* super-linear cache processing */
5445
5446 if (scan->flags) {
a0d0e21e 5447
2c2d71f5 5448 if (!PL_reg_maxiter) {
c476f425
DM
5449 /* start the countdown: Postpone detection until we
5450 * know the match is not *that* much linear. */
2c2d71f5 5451 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
66bf836d
DM
5452 /* possible overflow for long strings and many CURLYX's */
5453 if (PL_reg_maxiter < 0)
5454 PL_reg_maxiter = I32_MAX;
2c2d71f5
JH
5455 PL_reg_leftiter = PL_reg_maxiter;
5456 }
c476f425 5457
2c2d71f5 5458 if (PL_reg_leftiter-- == 0) {
c476f425 5459 /* initialise cache */
3298f257 5460 const I32 size = (PL_reg_maxiter + 7)/8;
2c2d71f5 5461 if (PL_reg_poscache) {
eb160463 5462 if ((I32)PL_reg_poscache_size < size) {
2c2d71f5
JH
5463 Renew(PL_reg_poscache, size, char);
5464 PL_reg_poscache_size = size;
5465 }
5466 Zero(PL_reg_poscache, size, char);
5467 }
5468 else {
5469 PL_reg_poscache_size = size;
a02a5408 5470 Newxz(PL_reg_poscache, size, char);
2c2d71f5 5471 }
c476f425
DM
5472 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5473 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5474 PL_colors[4], PL_colors[5])
5475 );
2c2d71f5 5476 }
c476f425 5477
2c2d71f5 5478 if (PL_reg_leftiter < 0) {
c476f425
DM
5479 /* have we already failed at this position? */
5480 I32 offset, mask;
5481 offset = (scan->flags & 0xf) - 1
5482 + (locinput - PL_bostr) * (scan->flags>>4);
5483 mask = 1 << (offset % 8);
5484 offset /= 8;
5485 if (PL_reg_poscache[offset] & mask) {
5486 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5487 "%*s whilem: (cache) already tried at this position...\n",
5488 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5489 );
3298f257 5490 sayNO; /* cache records failure */
2c2d71f5 5491 }
c476f425
DM
5492 ST.cache_offset = offset;
5493 ST.cache_mask = mask;
2c2d71f5 5494 }
c476f425 5495 }
2c2d71f5 5496
c476f425 5497 /* Prefer B over A for minimal matching. */
a687059c 5498
c476f425
DM
5499 if (cur_curlyx->u.curlyx.minmod) {
5500 ST.save_curlyx = cur_curlyx;
5501 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
92da3157
DM
5502 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5503 maxopenparen);
c476f425 5504 REGCP_SET(ST.lastcp);
4d5016e5
DM
5505 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5506 locinput);
118e2215 5507 assert(0); /* NOTREACHED */
c476f425 5508 }
a0d0e21e 5509
c476f425
DM
5510 /* Prefer A over B for maximal matching. */
5511
d02d6d97 5512 if (n < max) { /* More greed allowed? */
92da3157
DM
5513 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5514 maxopenparen);
c476f425
DM
5515 cur_curlyx->u.curlyx.lastloc = locinput;
5516 REGCP_SET(ST.lastcp);
4d5016e5 5517 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
118e2215 5518 assert(0); /* NOTREACHED */
c476f425
DM
5519 }
5520 goto do_whilem_B_max;
5521 }
118e2215 5522 assert(0); /* NOTREACHED */
c476f425
DM
5523
5524 case WHILEM_B_min: /* just matched B in a minimal match */
5525 case WHILEM_B_max: /* just matched B in a maximal match */
5526 cur_curlyx = ST.save_curlyx;
5527 sayYES;
118e2215 5528 assert(0); /* NOTREACHED */
c476f425
DM
5529
5530 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5531 cur_curlyx = ST.save_curlyx;
5532 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5533 cur_curlyx->u.curlyx.count--;
5534 CACHEsayNO;
118e2215 5535 assert(0); /* NOTREACHED */
c476f425
DM
5536
5537 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
5538 /* FALL THROUGH */
5539 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa 5540 REGCP_UNWIND(ST.lastcp);
92da3157 5541 regcppop(rex, &maxopenparen);
c476f425
DM
5542 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5543 cur_curlyx->u.curlyx.count--;
5544 CACHEsayNO;
118e2215 5545 assert(0); /* NOTREACHED */
c476f425
DM
5546
5547 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5548 REGCP_UNWIND(ST.lastcp);
92da3157 5549 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
c476f425
DM
5550 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5551 "%*s whilem: failed, trying continuation...\n",
5552 REPORT_CODE_OFF+depth*2, "")
5553 );
5554 do_whilem_B_max:
5555 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5556 && ckWARN(WARN_REGEXP)
39819bd9 5557 && !reginfo->warned)
c476f425 5558 {
39819bd9 5559 reginfo->warned = TRUE;
dcbac5bb
FC
5560 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5561 "Complex regular subexpression recursion limit (%d) "
5562 "exceeded",
c476f425
DM
5563 REG_INFTY - 1);
5564 }
5565
5566 /* now try B */
5567 ST.save_curlyx = cur_curlyx;
5568 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
5569 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5570 locinput);
118e2215 5571 assert(0); /* NOTREACHED */
c476f425
DM
5572
5573 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5574 cur_curlyx = ST.save_curlyx;
5575 REGCP_UNWIND(ST.lastcp);
92da3157 5576 regcppop(rex, &maxopenparen);
c476f425 5577
d02d6d97 5578 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5579 /* Maximum greed exceeded */
5580 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5581 && ckWARN(WARN_REGEXP)
39819bd9 5582 && !reginfo->warned)
c476f425 5583 {
39819bd9 5584 reginfo->warned = TRUE;
c476f425 5585 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5586 "Complex regular subexpression recursion "
5587 "limit (%d) exceeded",
c476f425 5588 REG_INFTY - 1);
a0d0e21e 5589 }
c476f425 5590 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5591 CACHEsayNO;
a0d0e21e 5592 }
c476f425
DM
5593
5594 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5595 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5596 );
5597 /* Try grabbing another A and see if it helps. */
c476f425 5598 cur_curlyx->u.curlyx.lastloc = locinput;
92da3157
DM
5599 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5600 maxopenparen);
c476f425 5601 REGCP_SET(ST.lastcp);
d02d6d97 5602 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
5603 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5604 locinput);
118e2215 5605 assert(0); /* NOTREACHED */
40a82448
DM
5606
5607#undef ST
5608#define ST st->u.branch
5609
5610 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
5611 next = scan + ARG(scan);
5612 if (next == scan)
5613 next = NULL;
40a82448
DM
5614 scan = NEXTOPER(scan);
5615 /* FALL THROUGH */
c277df42 5616
40a82448
DM
5617 case BRANCH: /* /(...|A|...)/ */
5618 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 5619 ST.lastparen = rex->lastparen;
f6033a9d 5620 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
5621 ST.next_branch = next;
5622 REGCP_SET(ST.cp);
02db2b7b 5623
40a82448 5624 /* Now go into the branch */
5d458dd8 5625 if (has_cutgroup) {
4d5016e5 5626 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5627 } else {
4d5016e5 5628 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 5629 }
118e2215 5630 assert(0); /* NOTREACHED */
3c0563b9
DM
5631
5632 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 5633 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 5634 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 5635 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
118e2215 5636 assert(0); /* NOTREACHED */
3c0563b9 5637
5d458dd8
YO
5638 case CUTGROUP_next_fail:
5639 do_cutgroup = 1;
5640 no_final = 1;
5641 if (st->u.mark.mark_name)
5642 sv_commit = st->u.mark.mark_name;
5643 sayNO;
118e2215 5644 assert(0); /* NOTREACHED */
3c0563b9 5645
5d458dd8
YO
5646 case BRANCH_next:
5647 sayYES;
118e2215 5648 assert(0); /* NOTREACHED */
3c0563b9 5649
40a82448 5650 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
5651 if (do_cutgroup) {
5652 do_cutgroup = 0;
5653 no_final = 0;
5654 }
40a82448 5655 REGCP_UNWIND(ST.cp);
a8d1f4b4 5656 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
5657 scan = ST.next_branch;
5658 /* no more branches? */
5d458dd8
YO
5659 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5660 DEBUG_EXECUTE_r({
5661 PerlIO_printf( Perl_debug_log,
5662 "%*s %sBRANCH failed...%s\n",
5663 REPORT_CODE_OFF+depth*2, "",
5664 PL_colors[4],
5665 PL_colors[5] );
5666 });
5667 sayNO_SILENT;
5668 }
40a82448 5669 continue; /* execute next BRANCH[J] op */
118e2215 5670 assert(0); /* NOTREACHED */
40a82448 5671
3c0563b9 5672 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 5673 minmod = 1;
a0d0e21e 5674 break;
40a82448
DM
5675
5676#undef ST
5677#define ST st->u.curlym
5678
5679 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5680
5681 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 5682 * only a single backtracking state, no matter how many matches
40a82448
DM
5683 * there are in {m,n}. It relies on the pattern being constant
5684 * length, with no parens to influence future backrefs
5685 */
5686
5687 ST.me = scan;
dc45a647 5688 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 5689
f6033a9d
DM
5690 ST.lastparen = rex->lastparen;
5691 ST.lastcloseparen = rex->lastcloseparen;
5692
40a82448
DM
5693 /* if paren positive, emulate an OPEN/CLOSE around A */
5694 if (ST.me->flags) {
3b6647e0 5695 U32 paren = ST.me->flags;
92da3157
DM
5696 if (paren > maxopenparen)
5697 maxopenparen = paren;
c277df42 5698 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 5699 }
40a82448
DM
5700 ST.A = scan;
5701 ST.B = next;
5702 ST.alen = 0;
5703 ST.count = 0;
24d3c4a9
DM
5704 ST.minmod = minmod;
5705 minmod = 0;
40a82448
DM
5706 ST.c1 = CHRTEST_UNINIT;
5707 REGCP_SET(ST.cp);
6407bf3b 5708
40a82448
DM
5709 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5710 goto curlym_do_B;
5711
5712 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 5713 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
118e2215 5714 assert(0); /* NOTREACHED */
5f80c4cf 5715
40a82448 5716 case CURLYM_A: /* we've just matched an A */
40a82448
DM
5717 ST.count++;
5718 /* after first match, determine A's length: u.curlym.alen */
5719 if (ST.count == 1) {
5720 if (PL_reg_match_utf8) {
c07e9d7b
DM
5721 char *s = st->locinput;
5722 while (s < locinput) {
40a82448
DM
5723 ST.alen++;
5724 s += UTF8SKIP(s);
5725 }
5726 }
5727 else {
c07e9d7b 5728 ST.alen = locinput - st->locinput;
40a82448
DM
5729 }
5730 if (ST.alen == 0)
5731 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5732 }
0cadcf80
DM
5733 DEBUG_EXECUTE_r(
5734 PerlIO_printf(Perl_debug_log,
40a82448 5735 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 5736 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 5737 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
5738 );
5739
0a4db386 5740 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5741 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5742 goto fake_end;
5743
c966426a
DM
5744 {
5745 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5746 if ( max == REG_INFTY || ST.count < max )
5747 goto curlym_do_A; /* try to match another A */
5748 }
40a82448 5749 goto curlym_do_B; /* try to match B */
5f80c4cf 5750
40a82448
DM
5751 case CURLYM_A_fail: /* just failed to match an A */
5752 REGCP_UNWIND(ST.cp);
0a4db386
YO
5753
5754 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5755 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5756 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 5757 sayNO;
0cadcf80 5758
40a82448 5759 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
5760 if (ST.c1 == CHRTEST_UNINIT) {
5761 /* calculate c1 and c2 for possible match of 1st char
5762 * following curly */
5763 ST.c1 = ST.c2 = CHRTEST_VOID;
5764 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5765 regnode *text_node = ST.B;
5766 if (! HAS_TEXT(text_node))
5767 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
5768 /* this used to be
5769
5770 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5771
5772 But the former is redundant in light of the latter.
5773
5774 if this changes back then the macro for
5775 IS_TEXT and friends need to change.
5776 */
c74f6de9 5777 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8 5778 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1
DM
5779 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5780 is_utf8_pat))
c74f6de9
KW
5781 {
5782 sayNO;
5783 }
c277df42 5784 }
c277df42 5785 }
40a82448
DM
5786 }
5787
5788 DEBUG_EXECUTE_r(
5789 PerlIO_printf(Perl_debug_log,
5790 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 5791 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
5792 "", (IV)ST.count)
5793 );
c74f6de9 5794 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
5795 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5796 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5797 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5798 {
5799 /* simulate B failing */
5800 DEBUG_OPTIMISE_r(
5801 PerlIO_printf(Perl_debug_log,
5802 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5803 (int)(REPORT_CODE_OFF+(depth*2)),"",
5804 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5805 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5806 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5807 );
5808 state_num = CURLYM_B_fail;
5809 goto reenter_switch;
5810 }
5811 }
5812 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
5813 /* simulate B failing */
5814 DEBUG_OPTIMISE_r(
5815 PerlIO_printf(Perl_debug_log,
79a2a0e8 5816 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5400f398 5817 (int)(REPORT_CODE_OFF+(depth*2)),"",
79a2a0e8
KW
5818 (int) nextchr, ST.c1, ST.c2)
5819 );
5400f398
KW
5820 state_num = CURLYM_B_fail;
5821 goto reenter_switch;
5822 }
c74f6de9 5823 }
40a82448
DM
5824
5825 if (ST.me->flags) {
f6033a9d 5826 /* emulate CLOSE: mark current A as captured */
40a82448
DM
5827 I32 paren = ST.me->flags;
5828 if (ST.count) {
b93070ed 5829 rex->offs[paren].start
c07e9d7b
DM
5830 = HOPc(locinput, -ST.alen) - PL_bostr;
5831 rex->offs[paren].end = locinput - PL_bostr;
f6033a9d
DM
5832 if ((U32)paren > rex->lastparen)
5833 rex->lastparen = paren;
5834 rex->lastcloseparen = paren;
c277df42 5835 }
40a82448 5836 else
b93070ed 5837 rex->offs[paren].end = -1;
0a4db386 5838 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 5839 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
5840 {
5841 if (ST.count)
5842 goto fake_end;
5843 else
5844 sayNO;
5845 }
c277df42 5846 }
0a4db386 5847
4d5016e5 5848 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
118e2215 5849 assert(0); /* NOTREACHED */
40a82448
DM
5850
5851 case CURLYM_B_fail: /* just failed to match a B */
5852 REGCP_UNWIND(ST.cp);
a8d1f4b4 5853 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 5854 if (ST.minmod) {
84d2fa14
HS
5855 I32 max = ARG2(ST.me);
5856 if (max != REG_INFTY && ST.count == max)
40a82448
DM
5857 sayNO;
5858 goto curlym_do_A; /* try to match a further A */
5859 }
5860 /* backtrack one A */
5861 if (ST.count == ARG1(ST.me) /* min */)
5862 sayNO;
5863 ST.count--;
7016d6eb 5864 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
5865 goto curlym_do_B; /* try to match B */
5866
c255a977
DM
5867#undef ST
5868#define ST st->u.curly
40a82448 5869
c255a977
DM
5870#define CURLY_SETPAREN(paren, success) \
5871 if (paren) { \
5872 if (success) { \
b93070ed
DM
5873 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5874 rex->offs[paren].end = locinput - PL_bostr; \
f6033a9d
DM
5875 if (paren > rex->lastparen) \
5876 rex->lastparen = paren; \
b93070ed 5877 rex->lastcloseparen = paren; \
c255a977 5878 } \
f6033a9d 5879 else { \
b93070ed 5880 rex->offs[paren].end = -1; \
f6033a9d
DM
5881 rex->lastparen = ST.lastparen; \
5882 rex->lastcloseparen = ST.lastcloseparen; \
5883 } \
c255a977
DM
5884 }
5885
b40a2c17 5886 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
5887 ST.paren = 0;
5888 ST.min = 0;
5889 ST.max = REG_INFTY;
a0d0e21e
LW
5890 scan = NEXTOPER(scan);
5891 goto repeat;
3c0563b9 5892
b40a2c17 5893 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
5894 ST.paren = 0;
5895 ST.min = 1;
5896 ST.max = REG_INFTY;
c277df42 5897 scan = NEXTOPER(scan);
c255a977 5898 goto repeat;
3c0563b9 5899
b40a2c17 5900 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
5901 ST.paren = scan->flags; /* Which paren to set */
5902 ST.lastparen = rex->lastparen;
f6033a9d 5903 ST.lastcloseparen = rex->lastcloseparen;
92da3157
DM
5904 if (ST.paren > maxopenparen)
5905 maxopenparen = ST.paren;
c255a977
DM
5906 ST.min = ARG1(scan); /* min to match */
5907 ST.max = ARG2(scan); /* max to match */
0a4db386 5908 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 5909 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
5910 ST.min=1;
5911 ST.max=1;
5912 }
c255a977
DM
5913 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5914 goto repeat;
3c0563b9 5915
b40a2c17 5916 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
5917 ST.paren = 0;
5918 ST.min = ARG1(scan); /* min to match */
5919 ST.max = ARG2(scan); /* max to match */
5920 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 5921 repeat:
a0d0e21e
LW
5922 /*
5923 * Lookahead to avoid useless match attempts
5924 * when we know what character comes next.
c255a977 5925 *
5f80c4cf
JP
5926 * Used to only do .*x and .*?x, but now it allows
5927 * for )'s, ('s and (?{ ... })'s to be in the way
5928 * of the quantifier and the EXACT-like node. -- japhy
5929 */
5930
eb5c1be8 5931 assert(ST.min <= ST.max);
3337dfe3
KW
5932 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5933 ST.c1 = ST.c2 = CHRTEST_VOID;
5934 }
5935 else {
5f80c4cf
JP
5936 regnode *text_node = next;
5937
3dab1dad
YO
5938 if (! HAS_TEXT(text_node))
5939 FIND_NEXT_IMPT(text_node);
5f80c4cf 5940
9e137952 5941 if (! HAS_TEXT(text_node))
c255a977 5942 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 5943 else {
ee9b8eae 5944 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 5945 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 5946 }
c74f6de9 5947 else {
ee9b8eae
YO
5948
5949 /* Currently we only get here when
5950
5951 PL_rekind[OP(text_node)] == EXACT
5952
5953 if this changes back then the macro for IS_TEXT and
5954 friends need to change. */
79a2a0e8 5955 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1
DM
5956 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5957 is_utf8_pat))
c74f6de9
KW
5958 {
5959 sayNO;
5960 }
5961 }
1aa99e6b 5962 }
bbce6d69 5963 }
c255a977
DM
5964
5965 ST.A = scan;
5966 ST.B = next;
24d3c4a9 5967 if (minmod) {
eb72505d 5968 char *li = locinput;
24d3c4a9 5969 minmod = 0;
984e6dd1
DM
5970 if (ST.min &&
5971 regrepeat(rex, &li, ST.A, ST.min, depth, is_utf8_pat)
5972 < ST.min)
4633a7c4 5973 sayNO;
7016d6eb 5974 SET_locinput(li);
c255a977 5975 ST.count = ST.min;
c255a977
DM
5976 REGCP_SET(ST.cp);
5977 if (ST.c1 == CHRTEST_VOID)
5978 goto curly_try_B_min;
5979
5980 ST.oldloc = locinput;
5981
5982 /* set ST.maxpos to the furthest point along the
5983 * string that could possibly match */
5984 if (ST.max == REG_INFTY) {
5985 ST.maxpos = PL_regeol - 1;
f2ed9b32 5986 if (utf8_target)
c255a977
DM
5987 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5988 ST.maxpos--;
5989 }
f2ed9b32 5990 else if (utf8_target) {
c255a977
DM
5991 int m = ST.max - ST.min;
5992 for (ST.maxpos = locinput;
9a902117 5993 m >0 && ST.maxpos < PL_regeol; m--)
c255a977
DM
5994 ST.maxpos += UTF8SKIP(ST.maxpos);
5995 }
5996 else {
5997 ST.maxpos = locinput + ST.max - ST.min;
5998 if (ST.maxpos >= PL_regeol)
5999 ST.maxpos = PL_regeol - 1;
6000 }
6001 goto curly_try_B_min_known;
6002
6003 }
6004 else {
eb72505d
DM
6005 /* avoid taking address of locinput, so it can remain
6006 * a register var */
6007 char *li = locinput;
984e6dd1
DM
6008 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth,
6009 is_utf8_pat);
c255a977
DM
6010 if (ST.count < ST.min)
6011 sayNO;
7016d6eb 6012 SET_locinput(li);
c255a977
DM
6013 if ((ST.count > ST.min)
6014 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6015 {
6016 /* A{m,n} must come at the end of the string, there's
6017 * no point in backing off ... */
6018 ST.min = ST.count;
6019 /* ...except that $ and \Z can match before *and* after
6020 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6021 We may back off by one in this case. */
eb72505d 6022 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
6023 ST.min--;
6024 }
6025 REGCP_SET(ST.cp);
6026 goto curly_try_B_max;
6027 }
118e2215 6028 assert(0); /* NOTREACHED */
c255a977
DM
6029
6030
6031 case CURLY_B_min_known_fail:
6032 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 6033
c255a977 6034 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6035 if (ST.paren) {
6036 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6037 }
c255a977
DM
6038 /* Couldn't or didn't -- move forward. */
6039 ST.oldloc = locinput;
f2ed9b32 6040 if (utf8_target)
c255a977
DM
6041 locinput += UTF8SKIP(locinput);
6042 else
6043 locinput++;
6044 ST.count++;
6045 curly_try_B_min_known:
6046 /* find the next place where 'B' could work, then call B */
6047 {
6048 int n;
f2ed9b32 6049 if (utf8_target) {
c255a977
DM
6050 n = (ST.oldloc == locinput) ? 0 : 1;
6051 if (ST.c1 == ST.c2) {
c255a977 6052 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6053 while (locinput <= ST.maxpos
6054 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6055 {
6056 locinput += UTF8SKIP(locinput);
c255a977
DM
6057 n++;
6058 }
1aa99e6b
IH
6059 }
6060 else {
c255a977 6061 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6062 while (locinput <= ST.maxpos
6063 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6064 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6065 {
6066 locinput += UTF8SKIP(locinput);
c255a977 6067 n++;
1aa99e6b 6068 }
0fe9bf95
IZ
6069 }
6070 }
5400f398 6071 else { /* Not utf8_target */
c255a977
DM
6072 if (ST.c1 == ST.c2) {
6073 while (locinput <= ST.maxpos &&
6074 UCHARAT(locinput) != ST.c1)
6075 locinput++;
bbce6d69 6076 }
c255a977
DM
6077 else {
6078 while (locinput <= ST.maxpos
6079 && UCHARAT(locinput) != ST.c1
6080 && UCHARAT(locinput) != ST.c2)
6081 locinput++;
a0ed51b3 6082 }
c255a977
DM
6083 n = locinput - ST.oldloc;
6084 }
6085 if (locinput > ST.maxpos)
6086 sayNO;
c255a977 6087 if (n) {
eb72505d
DM
6088 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6089 * at b; check that everything between oldloc and
6090 * locinput matches */
6091 char *li = ST.oldloc;
c255a977 6092 ST.count += n;
984e6dd1 6093 if (regrepeat(rex, &li, ST.A, n, depth, is_utf8_pat) < n)
4633a7c4 6094 sayNO;
eb72505d 6095 assert(n == REG_INFTY || locinput == li);
a0d0e21e 6096 }
c255a977 6097 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6098 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6099 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6100 goto fake_end;
6101 }
4d5016e5 6102 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 6103 }
118e2215 6104 assert(0); /* NOTREACHED */
c255a977
DM
6105
6106
6107 case CURLY_B_min_fail:
6108 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
6109
6110 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6111 if (ST.paren) {
6112 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6113 }
c255a977 6114 /* failed -- move forward one */
f73aaa43 6115 {
eb72505d 6116 char *li = locinput;
984e6dd1 6117 if (!regrepeat(rex, &li, ST.A, 1, depth, is_utf8_pat)) {
f73aaa43
DM
6118 sayNO;
6119 }
eb72505d 6120 locinput = li;
f73aaa43
DM
6121 }
6122 {
c255a977 6123 ST.count++;
c255a977
DM
6124 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6125 ST.count > 0)) /* count overflow ? */
15272685 6126 {
c255a977
DM
6127 curly_try_B_min:
6128 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6129 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6130 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6131 goto fake_end;
6132 }
4d5016e5 6133 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
6134 }
6135 }
c74f6de9 6136 sayNO;
118e2215 6137 assert(0); /* NOTREACHED */
c255a977
DM
6138
6139
6140 curly_try_B_max:
6141 /* a successful greedy match: now try to match B */
40d049e4 6142 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6143 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
6144 goto fake_end;
6145 }
c255a977 6146 {
79a2a0e8
KW
6147 bool could_match = locinput < PL_regeol;
6148
c255a977 6149 /* If it could work, try it. */
79a2a0e8
KW
6150 if (ST.c1 != CHRTEST_VOID && could_match) {
6151 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6152 {
6153 could_match = memEQ(locinput,
6154 ST.c1_utf8,
6155 UTF8SKIP(locinput))
6156 || memEQ(locinput,
6157 ST.c2_utf8,
6158 UTF8SKIP(locinput));
6159 }
6160 else {
6161 could_match = UCHARAT(locinput) == ST.c1
6162 || UCHARAT(locinput) == ST.c2;
6163 }
6164 }
6165 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 6166 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 6167 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
118e2215 6168 assert(0); /* NOTREACHED */
c255a977
DM
6169 }
6170 }
6171 /* FALL THROUGH */
3c0563b9 6172
c255a977
DM
6173 case CURLY_B_max_fail:
6174 /* failed to find B in a greedy match */
c255a977
DM
6175
6176 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6177 if (ST.paren) {
6178 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6179 }
c255a977
DM
6180 /* back up. */
6181 if (--ST.count < ST.min)
6182 sayNO;
eb72505d 6183 locinput = HOPc(locinput, -1);
c255a977
DM
6184 goto curly_try_B_max;
6185
6186#undef ST
6187
3c0563b9 6188 case END: /* last op of main pattern */
6bda09f9 6189 fake_end:
faec1544
DM
6190 if (cur_eval) {
6191 /* we've just finished A in /(??{A})B/; now continue with B */
984e6dd1
DM
6192 st->u.eval.saved_utf8_pat = is_utf8_pat;
6193 is_utf8_pat = cur_eval->u.eval.saved_utf8_pat;
faec1544 6194
288b8c02 6195 st->u.eval.prev_rex = rex_sv; /* inner */
92da3157
DM
6196
6197 /* Save *all* the positions. */
6198 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
ec43f78b
DM
6199 rex_sv = cur_eval->u.eval.prev_rex;
6200 SET_reg_curpm(rex_sv);
8d919b0a 6201 rex = ReANY(rex_sv);
f8fc2ecf 6202 rexi = RXi_GET(rex);
faec1544 6203 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 6204
faec1544 6205 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
6206
6207 /* Restore parens of the outer rex without popping the
6208 * savestack */
92da3157
DM
6209 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6210 &maxopenparen);
faec1544
DM
6211
6212 st->u.eval.prev_eval = cur_eval;
6213 cur_eval = cur_eval->u.eval.prev_eval;
6214 DEBUG_EXECUTE_r(
2a49f0f5
JH
6215 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6216 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
6217 if ( nochange_depth )
6218 nochange_depth--;
6219
4d5016e5
DM
6220 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6221 locinput); /* match B */
faec1544
DM
6222 }
6223
3b0527fe 6224 if (locinput < reginfo->till) {
a3621e74 6225 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
6226 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6227 PL_colors[4],
6228 (long)(locinput - PL_reg_starttry),
3b0527fe 6229 (long)(reginfo->till - PL_reg_starttry),
7821416a 6230 PL_colors[5]));
58e23c8d 6231
262b90c4 6232 sayNO_SILENT; /* Cannot match: too short. */
7821416a 6233 }
262b90c4 6234 sayYES; /* Success! */
dad79028
DM
6235
6236 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6237 DEBUG_EXECUTE_r(
6238 PerlIO_printf(Perl_debug_log,
6239 "%*s %ssubpattern success...%s\n",
5bc10b2c 6240 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 6241 sayYES; /* Success! */
dad79028 6242
40a82448
DM
6243#undef ST
6244#define ST st->u.ifmatch
6245
37f53970
DM
6246 {
6247 char *newstart;
6248
40a82448
DM
6249 case SUSPEND: /* (?>A) */
6250 ST.wanted = 1;
37f53970 6251 newstart = locinput;
9041c2e3 6252 goto do_ifmatch;
dad79028 6253
40a82448
DM
6254 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6255 ST.wanted = 0;
dad79028
DM
6256 goto ifmatch_trivial_fail_test;
6257
40a82448
DM
6258 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6259 ST.wanted = 1;
dad79028 6260 ifmatch_trivial_fail_test:
a0ed51b3 6261 if (scan->flags) {
52657f30 6262 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
6263 if (!s) {
6264 /* trivial fail */
24d3c4a9
DM
6265 if (logical) {
6266 logical = 0;
f2338a2e 6267 sw = 1 - cBOOL(ST.wanted);
dad79028 6268 }
40a82448 6269 else if (ST.wanted)
dad79028
DM
6270 sayNO;
6271 next = scan + ARG(scan);
6272 if (next == scan)
6273 next = NULL;
6274 break;
6275 }
37f53970 6276 newstart = s;
a0ed51b3
LW
6277 }
6278 else
37f53970 6279 newstart = locinput;
a0ed51b3 6280
c277df42 6281 do_ifmatch:
40a82448 6282 ST.me = scan;
24d3c4a9 6283 ST.logical = logical;
24d786f4
YO
6284 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6285
40a82448 6286 /* execute body of (?...A) */
37f53970 6287 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
118e2215 6288 assert(0); /* NOTREACHED */
37f53970 6289 }
40a82448
DM
6290
6291 case IFMATCH_A_fail: /* body of (?...A) failed */
6292 ST.wanted = !ST.wanted;
6293 /* FALL THROUGH */
6294
6295 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 6296 if (ST.logical) {
f2338a2e 6297 sw = cBOOL(ST.wanted);
40a82448
DM
6298 }
6299 else if (!ST.wanted)
6300 sayNO;
6301
37f53970
DM
6302 if (OP(ST.me) != SUSPEND) {
6303 /* restore old position except for (?>...) */
6304 locinput = st->locinput;
40a82448
DM
6305 }
6306 scan = ST.me + ARG(ST.me);
6307 if (scan == ST.me)
6308 scan = NULL;
6309 continue; /* execute B */
6310
6311#undef ST
dad79028 6312
3c0563b9
DM
6313 case LONGJMP: /* alternative with many branches compiles to
6314 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
6315 next = scan + ARG(scan);
6316 if (next == scan)
6317 next = NULL;
a0d0e21e 6318 break;
3c0563b9
DM
6319
6320 case COMMIT: /* (*COMMIT) */
e2e6a0f1
YO
6321 reginfo->cutpoint = PL_regeol;
6322 /* FALLTHROUGH */
3c0563b9
DM
6323
6324 case PRUNE: /* (*PRUNE) */
e2e6a0f1 6325 if (!scan->flags)
ad64d0ec 6326 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6327 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
118e2215 6328 assert(0); /* NOTREACHED */
3c0563b9 6329
54612592
YO
6330 case COMMIT_next_fail:
6331 no_final = 1;
6332 /* FALLTHROUGH */
3c0563b9
DM
6333
6334 case OPFAIL: /* (*FAIL) */
7f69552c 6335 sayNO;
118e2215 6336 assert(0); /* NOTREACHED */
e2e6a0f1
YO
6337
6338#define ST st->u.mark
3c0563b9 6339 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 6340 ST.prev_mark = mark_state;
5d458dd8 6341 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 6342 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 6343 mark_state = st;
4d5016e5
DM
6344 ST.mark_loc = locinput;
6345 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
118e2215 6346 assert(0); /* NOTREACHED */
3c0563b9 6347
e2e6a0f1
YO
6348 case MARKPOINT_next:
6349 mark_state = ST.prev_mark;
6350 sayYES;
118e2215 6351 assert(0); /* NOTREACHED */
3c0563b9 6352
e2e6a0f1 6353 case MARKPOINT_next_fail:
5d458dd8 6354 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
6355 {
6356 if (ST.mark_loc > startpoint)
6357 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6358 popmark = NULL; /* we found our mark */
6359 sv_commit = ST.mark_name;
6360
6361 DEBUG_EXECUTE_r({
5d458dd8 6362 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
6363 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6364 REPORT_CODE_OFF+depth*2, "",
be2597df 6365 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
6366 });
6367 }
6368 mark_state = ST.prev_mark;
5d458dd8
YO
6369 sv_yes_mark = mark_state ?
6370 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 6371 sayNO;
118e2215 6372 assert(0); /* NOTREACHED */
3c0563b9
DM
6373
6374 case SKIP: /* (*SKIP) */
5d458dd8 6375 if (scan->flags) {
2bf803e2 6376 /* (*SKIP) : if we fail we cut here*/
5d458dd8 6377 ST.mark_name = NULL;
e2e6a0f1 6378 ST.mark_loc = locinput;
4d5016e5 6379 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 6380 } else {
2bf803e2 6381 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
6382 otherwise do nothing. Meaning we need to scan
6383 */
6384 regmatch_state *cur = mark_state;
ad64d0ec 6385 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
6386
6387 while (cur) {
6388 if ( sv_eq( cur->u.mark.mark_name,
6389 find ) )
6390 {
6391 ST.mark_name = find;
4d5016e5 6392 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
6393 }
6394 cur = cur->u.mark.prev_mark;
6395 }
e2e6a0f1 6396 }
2bf803e2 6397 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 6398 break;
3c0563b9 6399
5d458dd8
YO
6400 case SKIP_next_fail:
6401 if (ST.mark_name) {
6402 /* (*CUT:NAME) - Set up to search for the name as we
6403 collapse the stack*/
6404 popmark = ST.mark_name;
6405 } else {
6406 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
6407 if (ST.mark_loc > startpoint)
6408 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
6409 /* but we set sv_commit to latest mark_name if there
6410 is one so they can test to see how things lead to this
6411 cut */
6412 if (mark_state)
6413 sv_commit=mark_state->u.mark.mark_name;
6414 }
e2e6a0f1
YO
6415 no_final = 1;
6416 sayNO;
118e2215 6417 assert(0); /* NOTREACHED */
e2e6a0f1 6418#undef ST
3c0563b9
DM
6419
6420 case LNBREAK: /* \R */
7016d6eb 6421 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
e1d1eefb 6422 locinput += n;
e1d1eefb
YO
6423 } else
6424 sayNO;
6425 break;
6426
a0d0e21e 6427 default:
b900a521 6428 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 6429 PTR2UV(scan), OP(scan));
cea2e8a9 6430 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
6431
6432 /* this is a point to jump to in order to increment
6433 * locinput by one character */
6434 increment_locinput:
e6ca698c 6435 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
6436 if (utf8_target) {
6437 locinput += PL_utf8skip[nextchr];
7016d6eb 6438 /* locinput is allowed to go 1 char off the end, but not 2+ */
28b98f76
DM
6439 if (locinput > PL_regeol)
6440 sayNO;
28b98f76
DM
6441 }
6442 else
3640db6b 6443 locinput++;
28b98f76 6444 break;
5d458dd8
YO
6445
6446 } /* end switch */
95b24440 6447
5d458dd8
YO
6448 /* switch break jumps here */
6449 scan = next; /* prepare to execute the next op and ... */
6450 continue; /* ... jump back to the top, reusing st */
118e2215 6451 assert(0); /* NOTREACHED */
95b24440 6452
40a82448
DM
6453 push_yes_state:
6454 /* push a state that backtracks on success */
6455 st->u.yes.prev_yes_state = yes_state;
6456 yes_state = st;
6457 /* FALL THROUGH */
6458 push_state:
6459 /* push a new regex state, then continue at scan */
6460 {
6461 regmatch_state *newst;
6462
24b23f37
YO
6463 DEBUG_STACK_r({
6464 regmatch_state *cur = st;
6465 regmatch_state *curyes = yes_state;
6466 int curd = depth;
6467 regmatch_slab *slab = PL_regmatch_slab;
6468 for (;curd > -1;cur--,curd--) {
6469 if (cur < SLAB_FIRST(slab)) {
6470 slab = slab->prev;
6471 cur = SLAB_LAST(slab);
6472 }
6473 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6474 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6475 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6476 (curyes == cur) ? "yes" : ""
6477 );
6478 if (curyes == cur)
6479 curyes = cur->u.yes.prev_yes_state;
6480 }
6481 } else
6482 DEBUG_STATE_pp("push")
6483 );
40a82448 6484 depth++;
40a82448
DM
6485 st->locinput = locinput;
6486 newst = st+1;
6487 if (newst > SLAB_LAST(PL_regmatch_slab))
6488 newst = S_push_slab(aTHX);
6489 PL_regmatch_state = newst;
786e8c11 6490
4d5016e5 6491 locinput = pushinput;
40a82448
DM
6492 st = newst;
6493 continue;
118e2215 6494 assert(0); /* NOTREACHED */
40a82448 6495 }
a0d0e21e 6496 }
a687059c 6497
a0d0e21e
LW
6498 /*
6499 * We get here only if there's trouble -- normally "case END" is
6500 * the terminating point.
6501 */
cea2e8a9 6502 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6503 /*NOTREACHED*/
4633a7c4
LW
6504 sayNO;
6505
262b90c4 6506yes:
77cb431f
DM
6507 if (yes_state) {
6508 /* we have successfully completed a subexpression, but we must now
6509 * pop to the state marked by yes_state and continue from there */
77cb431f 6510 assert(st != yes_state);
5bc10b2c
DM
6511#ifdef DEBUGGING
6512 while (st != yes_state) {
6513 st--;
6514 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6515 PL_regmatch_slab = PL_regmatch_slab->prev;
6516 st = SLAB_LAST(PL_regmatch_slab);
6517 }
e2e6a0f1 6518 DEBUG_STATE_r({
54612592
YO
6519 if (no_final) {
6520 DEBUG_STATE_pp("pop (no final)");
6521 } else {
6522 DEBUG_STATE_pp("pop (yes)");
6523 }
e2e6a0f1 6524 });
5bc10b2c
DM
6525 depth--;
6526 }
6527#else
77cb431f
DM
6528 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6529 || yes_state > SLAB_LAST(PL_regmatch_slab))
6530 {
6531 /* not in this slab, pop slab */
6532 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6533 PL_regmatch_slab = PL_regmatch_slab->prev;
6534 st = SLAB_LAST(PL_regmatch_slab);
6535 }
6536 depth -= (st - yes_state);
5bc10b2c 6537#endif
77cb431f
DM
6538 st = yes_state;
6539 yes_state = st->u.yes.prev_yes_state;
6540 PL_regmatch_state = st;
24b23f37 6541
3640db6b 6542 if (no_final)
5d458dd8 6543 locinput= st->locinput;
54612592 6544 state_num = st->resume_state + no_final;
24d3c4a9 6545 goto reenter_switch;
77cb431f
DM
6546 }
6547
a3621e74 6548 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6549 PL_colors[4], PL_colors[5]));
02db2b7b 6550
ed301438 6551 if (PL_reg_state.re_state_eval_setup_done) {
19b95bf0
DM
6552 /* each successfully executed (?{...}) block does the equivalent of
6553 * local $^R = do {...}
6554 * When popping the save stack, all these locals would be undone;
6555 * bypass this by setting the outermost saved $^R to the latest
6556 * value */
6557 if (oreplsv != GvSV(PL_replgv))
6558 sv_setsv(oreplsv, GvSV(PL_replgv));
6559 }
95b24440 6560 result = 1;
aa283a38 6561 goto final_exit;
4633a7c4
LW
6562
6563no:
a3621e74 6564 DEBUG_EXECUTE_r(
7821416a 6565 PerlIO_printf(Perl_debug_log,
786e8c11 6566 "%*s %sfailed...%s\n",
5bc10b2c 6567 REPORT_CODE_OFF+depth*2, "",
786e8c11 6568 PL_colors[4], PL_colors[5])
7821416a 6569 );
aa283a38 6570
262b90c4 6571no_silent:
54612592
YO
6572 if (no_final) {
6573 if (yes_state) {
6574 goto yes;
6575 } else {
6576 goto final_exit;
6577 }
6578 }
aa283a38
DM
6579 if (depth) {
6580 /* there's a previous state to backtrack to */
40a82448
DM
6581 st--;
6582 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6583 PL_regmatch_slab = PL_regmatch_slab->prev;
6584 st = SLAB_LAST(PL_regmatch_slab);
6585 }
6586 PL_regmatch_state = st;
40a82448 6587 locinput= st->locinput;
40a82448 6588
5bc10b2c
DM
6589 DEBUG_STATE_pp("pop");
6590 depth--;
262b90c4
DM
6591 if (yes_state == st)
6592 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6593
24d3c4a9
DM
6594 state_num = st->resume_state + 1; /* failure = success + 1 */
6595 goto reenter_switch;
95b24440 6596 }
24d3c4a9 6597 result = 0;
aa283a38 6598
262b90c4 6599 final_exit:
bbe252da 6600 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6601 SV *sv_err = get_sv("REGERROR", 1);
6602 SV *sv_mrk = get_sv("REGMARK", 1);
6603 if (result) {
e2e6a0f1 6604 sv_commit = &PL_sv_no;
5d458dd8
YO
6605 if (!sv_yes_mark)
6606 sv_yes_mark = &PL_sv_yes;
6607 } else {
6608 if (!sv_commit)
6609 sv_commit = &PL_sv_yes;
6610 sv_yes_mark = &PL_sv_no;
6611 }
6612 sv_setsv(sv_err, sv_commit);
6613 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 6614 }
19b95bf0 6615
81ed78b2
DM
6616
6617 if (last_pushed_cv) {
6618 dSP;
6619 POP_MULTICALL;
4f8dbb2d 6620 PERL_UNUSED_VAR(SP);
81ed78b2
DM
6621 }
6622
2f554ef7
DM
6623 /* clean up; in particular, free all slabs above current one */
6624 LEAVE_SCOPE(oldsave);
5d9a96ca 6625
730f4c74
DM
6626 assert(!result || locinput - PL_bostr >= 0);
6627 return result ? locinput - PL_bostr : -1;
a687059c
LW
6628}
6629
6630/*
6631 - regrepeat - repeatedly match something simple, report how many
d60de1d1 6632 *
e64f369d
KW
6633 * What 'simple' means is a node which can be the operand of a quantifier like
6634 * '+', or {1,3}
6635 *
d60de1d1
DM
6636 * startposp - pointer a pointer to the start position. This is updated
6637 * to point to the byte following the highest successful
6638 * match.
6639 * p - the regnode to be repeatedly matched against.
4063ade8 6640 * max - maximum number of things to match.
d60de1d1 6641 * depth - (for debugging) backtracking depth.
a687059c 6642 */
76e3520e 6643STATIC I32
272d35c9 6644S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
984e6dd1 6645 I32 max, int depth, bool is_utf8_pat)
a687059c 6646{
27da23d5 6647 dVAR;
4063ade8 6648 char *scan; /* Pointer to current position in target string */
eb578fdb 6649 I32 c;
4063ade8
KW
6650 char *loceol = PL_regeol; /* local version */
6651 I32 hardcount = 0; /* How many matches so far */
eb578fdb 6652 bool utf8_target = PL_reg_match_utf8;
3018b823 6653 int to_complement = 0; /* Invert the result? */
d513472c 6654 UV utf8_flags;
3018b823 6655 _char_class_number classnum;
4f55667c
SP
6656#ifndef DEBUGGING
6657 PERL_UNUSED_ARG(depth);
6658#endif
a0d0e21e 6659
7918f24d
NC
6660 PERL_ARGS_ASSERT_REGREPEAT;
6661
f73aaa43 6662 scan = *startposp;
faf11cac
HS
6663 if (max == REG_INFTY)
6664 max = I32_MAX;
4063ade8 6665 else if (! utf8_target && scan + max < loceol)
7f596f4c 6666 loceol = scan + max;
4063ade8
KW
6667
6668 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6669 * to the maximum of how far we should go in it (leaving it set to the real
6670 * end, if the maximum permissible would take us beyond that). This allows
6671 * us to make the loop exit condition that we haven't gone past <loceol> to
6672 * also mean that we haven't exceeded the max permissible count, saving a
6673 * test each time through the loop. But it assumes that the OP matches a
6674 * single byte, which is true for most of the OPs below when applied to a
6675 * non-UTF-8 target. Those relatively few OPs that don't have this
6676 * characteristic will have to compensate.
6677 *
6678 * There is no adjustment for UTF-8 targets, as the number of bytes per
6679 * character varies. OPs will have to test both that the count is less
6680 * than the max permissible (using <hardcount> to keep track), and that we
6681 * are still within the bounds of the string (using <loceol>. A few OPs
6682 * match a single byte no matter what the encoding. They can omit the max
6683 * test if, for the UTF-8 case, they do the adjustment that was skipped
6684 * above.
6685 *
6686 * Thus, the code above sets things up for the common case; and exceptional
6687 * cases need extra work; the common case is to make sure <scan> doesn't
6688 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6689 * count doesn't exceed the maximum permissible */
6690
a0d0e21e 6691 switch (OP(p)) {
22c35a8c 6692 case REG_ANY:
f2ed9b32 6693 if (utf8_target) {
1aa99e6b 6694 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
6695 scan += UTF8SKIP(scan);
6696 hardcount++;
6697 }
6698 } else {
6699 while (scan < loceol && *scan != '\n')
6700 scan++;
a0ed51b3
LW
6701 }
6702 break;
ffc61ed2 6703 case SANY:
f2ed9b32 6704 if (utf8_target) {
a0804c9e 6705 while (scan < loceol && hardcount < max) {
def8e4ea
JH
6706 scan += UTF8SKIP(scan);
6707 hardcount++;
6708 }
6709 }
6710 else
6711 scan = loceol;
a0ed51b3 6712 break;
4063ade8
KW
6713 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6714 if (utf8_target && scan + max < loceol) {
6715
6716 /* <loceol> hadn't been adjusted in the UTF-8 case */
6717 scan += max;
6718 }
6719 else {
6720 scan = loceol;
6721 }
f33976b4 6722 break;
59d32103 6723 case EXACT:
984e6dd1 6724 assert(STR_LEN(p) == is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 6725
59d32103 6726 c = (U8)*STRING(p);
59d32103 6727
5e4a1da1
KW
6728 /* Can use a simple loop if the pattern char to match on is invariant
6729 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6730 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6731 * true iff it doesn't matter if the argument is in UTF-8 or not */
984e6dd1 6732 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! is_utf8_pat)) {
4063ade8
KW
6733 if (utf8_target && scan + max < loceol) {
6734 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6735 * since here, to match at all, 1 char == 1 byte */
6736 loceol = scan + max;
6737 }
59d32103
KW
6738 while (scan < loceol && UCHARAT(scan) == c) {
6739 scan++;
6740 }
6741 }
984e6dd1 6742 else if (is_utf8_pat) {
5e4a1da1
KW
6743 if (utf8_target) {
6744 STRLEN scan_char_len;
5e4a1da1 6745
4063ade8 6746 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
6747 * string EQ */
6748 while (hardcount < max
9a902117
KW
6749 && scan < loceol
6750 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
5e4a1da1
KW
6751 && memEQ(scan, STRING(p), scan_char_len))
6752 {
4200a00c 6753 scan += scan_char_len;
5e4a1da1
KW
6754 hardcount++;
6755 }
6756 }
6757 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 6758
5e4a1da1
KW
6759 /* Target isn't utf8; convert the character in the UTF-8
6760 * pattern to non-UTF8, and do a simple loop */
6761 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6762 while (scan < loceol && UCHARAT(scan) == c) {
6763 scan++;
6764 }
6765 } /* else pattern char is above Latin1, can't possibly match the
6766 non-UTF-8 target */
b40a2c17 6767 }
5e4a1da1 6768 else {
59d32103 6769
5e4a1da1
KW
6770 /* Here, the string must be utf8; pattern isn't, and <c> is
6771 * different in utf8 than not, so can't compare them directly.
6772 * Outside the loop, find the two utf8 bytes that represent c, and
6773 * then look for those in sequence in the utf8 string */
59d32103
KW
6774 U8 high = UTF8_TWO_BYTE_HI(c);
6775 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
6776
6777 while (hardcount < max
6778 && scan + 1 < loceol
6779 && UCHARAT(scan) == high
6780 && UCHARAT(scan + 1) == low)
6781 {
6782 scan += 2;
6783 hardcount++;
6784 }
6785 }
6786 break;
5e4a1da1 6787
2f7f8cb1
KW
6788 case EXACTFA:
6789 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6790 goto do_exactf;
6791
d4e0b827 6792 case EXACTFL:
272d35c9 6793 RXp_MATCH_TAINTED_on(prog);
17580e7a
KW
6794 utf8_flags = FOLDEQ_UTF8_LOCALE;
6795 goto do_exactf;
6796
d4e0b827 6797 case EXACTF:
62bf7766
KW
6798 utf8_flags = 0;
6799 goto do_exactf;
6800
3c760661 6801 case EXACTFU_SS:
fab2782b 6802 case EXACTFU_TRICKYFOLD:
9a5a5549 6803 case EXACTFU:
984e6dd1 6804 utf8_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 6805
613a425d
KW
6806 do_exactf: {
6807 int c1, c2;
6808 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 6809
984e6dd1 6810 assert(STR_LEN(p) == is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 6811
984e6dd1
DM
6812 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6813 is_utf8_pat))
6814 {
613a425d 6815 if (c1 == CHRTEST_VOID) {
49b95fad 6816 /* Use full Unicode fold matching */
4063ade8 6817 char *tmpeol = PL_regeol;
984e6dd1 6818 STRLEN pat_len = is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
49b95fad
KW
6819 while (hardcount < max
6820 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6821 STRING(p), NULL, pat_len,
984e6dd1 6822 is_utf8_pat, utf8_flags))
49b95fad
KW
6823 {
6824 scan = tmpeol;
4063ade8 6825 tmpeol = PL_regeol;
49b95fad
KW
6826 hardcount++;
6827 }
613a425d
KW
6828 }
6829 else if (utf8_target) {
6830 if (c1 == c2) {
4063ade8
KW
6831 while (scan < loceol
6832 && hardcount < max
613a425d
KW
6833 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6834 {
6835 scan += UTF8SKIP(scan);
6836 hardcount++;
6837 }
6838 }
6839 else {
4063ade8
KW
6840 while (scan < loceol
6841 && hardcount < max
613a425d
KW
6842 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6843 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6844 {
6845 scan += UTF8SKIP(scan);
6846 hardcount++;
6847 }
6848 }
6849 }
6850 else if (c1 == c2) {
6851 while (scan < loceol && UCHARAT(scan) == c1) {
6852 scan++;
6853 }
6854 }
6855 else {
6856 while (scan < loceol &&
6857 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6858 {
6859 scan++;
6860 }
6861 }
634c83a2 6862 }
bbce6d69 6863 break;
613a425d 6864 }
a0d0e21e 6865 case ANYOF:
954a2af6 6866 case ANYOF_WARN_SUPER:
e0193e47 6867 if (utf8_target) {
4e8910e0 6868 while (hardcount < max
9a902117 6869 && scan < loceol
635cd5d4 6870 && reginclass(prog, p, (U8*)scan, utf8_target))
4e8910e0 6871 {
9a902117 6872 scan += UTF8SKIP(scan);
ffc61ed2
JH
6873 hardcount++;
6874 }
6875 } else {
32fc9b6a 6876 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
6877 scan++;
6878 }
a0d0e21e 6879 break;
4063ade8 6880
3018b823 6881 /* The argument (FLAGS) to all the POSIX node types is the class number */
980866de 6882
3018b823
KW
6883 case NPOSIXL:
6884 to_complement = 1;
6885 /* FALLTHROUGH */
980866de 6886
3018b823 6887 case POSIXL:
272d35c9 6888 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
6889 if (! utf8_target) {
6890 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6891 *scan)))
a12cf05f 6892 {
3018b823
KW
6893 scan++;
6894 }
6895 } else {
6896 while (hardcount < max && scan < loceol
6897 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6898 (U8 *) scan)))
6899 {
6900 scan += UTF8SKIP(scan);
ffc61ed2
JH
6901 hardcount++;
6902 }
a0ed51b3
LW
6903 }
6904 break;
0658cdde 6905
3018b823
KW
6906 case POSIXD:
6907 if (utf8_target) {
6908 goto utf8_posix;
6909 }
6910 /* FALLTHROUGH */
6911
0658cdde 6912 case POSIXA:
4063ade8
KW
6913 if (utf8_target && scan + max < loceol) {
6914
7aee35ff
KW
6915 /* We didn't adjust <loceol> at the beginning of this routine
6916 * because is UTF-8, but it is actually ok to do so, since here, to
6917 * match, 1 char == 1 byte. */
4063ade8
KW
6918 loceol = scan + max;
6919 }
6920 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
6921 scan++;
6922 }
6923 break;
980866de 6924
3018b823
KW
6925 case NPOSIXD:
6926 if (utf8_target) {
6927 to_complement = 1;
6928 goto utf8_posix;
6929 }
6930 /* FALL THROUGH */
980866de 6931
3018b823
KW
6932 case NPOSIXA:
6933 if (! utf8_target) {
6934 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
a12cf05f
KW
6935 scan++;
6936 }
4063ade8 6937 }
3018b823 6938 else {
980866de 6939
3018b823
KW
6940 /* The complement of something that matches only ASCII matches all
6941 * UTF-8 variant code points, plus everything in ASCII that isn't
6942 * in the class. */
bedac28b 6943 while (hardcount < max && scan < loceol
3018b823
KW
6944 && (! UTF8_IS_INVARIANT(*scan)
6945 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
a12cf05f 6946 {
3018b823 6947 scan += UTF8SKIP(scan);
ffc61ed2
JH
6948 hardcount++;
6949 }
3018b823
KW
6950 }
6951 break;
980866de 6952
3018b823
KW
6953 case NPOSIXU:
6954 to_complement = 1;
6955 /* FALLTHROUGH */
6956
6957 case POSIXU:
6958 if (! utf8_target) {
6959 while (scan < loceol && to_complement
6960 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
4063ade8 6961 {
3018b823
KW
6962 scan++;
6963 }
cfaf538b
KW
6964 }
6965 else {
3018b823
KW
6966 utf8_posix:
6967 classnum = (_char_class_number) FLAGS(p);
6968 if (classnum < _FIRST_NON_SWASH_CC) {
6969
6970 /* Here, a swash is needed for above-Latin1 code points.
6971 * Process as many Latin1 code points using the built-in rules.
6972 * Go to another loop to finish processing upon encountering
6973 * the first Latin1 code point. We could do that in this loop
6974 * as well, but the other way saves having to test if the swash
6975 * has been loaded every time through the loop: extra space to
6976 * save a test. */
6977 while (hardcount < max && scan < loceol) {
6978 if (UTF8_IS_INVARIANT(*scan)) {
6979 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
6980 classnum))))
6981 {
6982 break;
6983 }
6984 scan++;
6985 }
6986 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
6987 if (! (to_complement
6988 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
6989 *(scan + 1)),
6990 classnum))))
6991 {
6992 break;
6993 }
6994 scan += 2;
6995 }
6996 else {
6997 goto found_above_latin1;
6998 }
6999
7000 hardcount++;
7001 }
7002 }
7003 else {
7004 /* For these character classes, the knowledge of how to handle
7005 * every code point is compiled in to Perl via a macro. This
7006 * code is written for making the loops as tight as possible.
7007 * It could be refactored to save space instead */
7008 switch (classnum) {
7009 case _CC_ENUM_SPACE: /* XXX would require separate code
7010 if we revert the change of \v
7011 matching this */
7012 /* FALL THROUGH */
7013 case _CC_ENUM_PSXSPC:
7014 while (hardcount < max
7015 && scan < loceol
7016 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7017 {
7018 scan += UTF8SKIP(scan);
7019 hardcount++;
7020 }
7021 break;
7022 case _CC_ENUM_BLANK:
7023 while (hardcount < max
7024 && scan < loceol
7025 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7026 {
7027 scan += UTF8SKIP(scan);
7028 hardcount++;
7029 }
7030 break;
7031 case _CC_ENUM_XDIGIT:
7032 while (hardcount < max
7033 && scan < loceol
7034 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7035 {
7036 scan += UTF8SKIP(scan);
7037 hardcount++;
7038 }
7039 break;
7040 case _CC_ENUM_VERTSPACE:
7041 while (hardcount < max
7042 && scan < loceol
7043 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7044 {
7045 scan += UTF8SKIP(scan);
7046 hardcount++;
7047 }
7048 break;
7049 case _CC_ENUM_CNTRL:
7050 while (hardcount < max
7051 && scan < loceol
7052 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7053 {
7054 scan += UTF8SKIP(scan);
7055 hardcount++;
7056 }
7057 break;
7058 default:
7059 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7060 }
7061 }
a0ed51b3 7062 }
3018b823 7063 break;
4063ade8 7064
3018b823
KW
7065 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7066
7067 /* Load the swash if not already present */
7068 if (! PL_utf8_swash_ptrs[classnum]) {
7069 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7070 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7071 "utf8", swash_property_names[classnum],
7072 &PL_sv_undef, 1, 0, NULL, &flags);
4063ade8 7073 }
3018b823
KW
7074
7075 while (hardcount < max && scan < loceol
7076 && to_complement ^ cBOOL(_generic_utf8(
7077 classnum,
7078 scan,
7079 swash_fetch(PL_utf8_swash_ptrs[classnum],
7080 (U8 *) scan,
7081 TRUE))))
7082 {
7083 scan += UTF8SKIP(scan);
7084 hardcount++;
7085 }
7086 break;
7087
e1d1eefb 7088 case LNBREAK:
e64f369d
KW
7089 if (utf8_target) {
7090 while (hardcount < max && scan < loceol &&
7091 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7092 scan += c;
7093 hardcount++;
7094 }
7095 } else {
7096 /* LNBREAK can match one or two latin chars, which is ok, but we
7097 * have to use hardcount in this situation, and throw away the
7098 * adjustment to <loceol> done before the switch statement */
7099 loceol = PL_regeol;
7100 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7101 scan+=c;
7102 hardcount++;
7103 }
7104 }
7105 break;
e1d1eefb 7106
584b1f02
KW
7107 case BOUND:
7108 case BOUNDA:
7109 case BOUNDL:
7110 case BOUNDU:
7111 case EOS:
7112 case GPOS:
7113 case KEEPS:
7114 case NBOUND:
7115 case NBOUNDA:
7116 case NBOUNDL:
7117 case NBOUNDU:
7118 case OPFAIL:
7119 case SBOL:
7120 case SEOL:
7121 /* These are all 0 width, so match right here or not at all. */
7122 break;
7123
7124 default:
7125 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7126 assert(0); /* NOTREACHED */
7127
a0d0e21e 7128 }
a687059c 7129
a0ed51b3
LW
7130 if (hardcount)
7131 c = hardcount;
7132 else
f73aaa43
DM
7133 c = scan - *startposp;
7134 *startposp = scan;
a687059c 7135
a3621e74 7136 DEBUG_r({
e68ec53f 7137 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 7138 DEBUG_EXECUTE_r({
e68ec53f
YO
7139 SV * const prop = sv_newmortal();
7140 regprop(prog, prop, p);
7141 PerlIO_printf(Perl_debug_log,
be8e71aa 7142 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 7143 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 7144 });
be8e71aa 7145 });
9041c2e3 7146
a0d0e21e 7147 return(c);
a687059c
LW
7148}
7149
c277df42 7150
be8e71aa 7151#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 7152/*
6c6525b8 7153- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
7154create a copy so that changes the caller makes won't change the shared one.
7155If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 7156 */
ffc61ed2 7157SV *
5aaab254 7158Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 7159{
6c6525b8 7160 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
7161
7162 if (altsvp) {
7163 *altsvp = NULL;
7164 }
7165
7166 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
6c6525b8
KW
7167}
7168#endif
7169
7170STATIC SV *
5aaab254 7171S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
6c6525b8 7172{
8c9eb58f
KW
7173 /* Returns the swash for the input 'node' in the regex 'prog'.
7174 * If <doinit> is true, will attempt to create the swash if not already
7175 * done.
7176 * If <listsvp> is non-null, will return the swash initialization string in
7177 * it.
8c9eb58f
KW
7178 * Tied intimately to how regcomp.c sets up the data structure */
7179
97aff369 7180 dVAR;
9e55ce06
JH
7181 SV *sw = NULL;
7182 SV *si = NULL;
7a6c6baa
KW
7183 SV* invlist = NULL;
7184
f8fc2ecf
YO
7185 RXi_GET_DECL(prog,progi);
7186 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 7187
6c6525b8 7188 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7918f24d 7189
ccb2541c
KW
7190 assert(ANYOF_NONBITMAP(node));
7191
4f639d21 7192 if (data && data->count) {
a3b680e6 7193 const U32 n = ARG(node);
ffc61ed2 7194
4f639d21 7195 if (data->what[n] == 's') {
ad64d0ec
NC
7196 SV * const rv = MUTABLE_SV(data->data[n]);
7197 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 7198 SV **const ary = AvARRAY(av);
87367d5f 7199 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 7200
8c9eb58f 7201 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 7202
88675427
KW
7203 /* Elements 2 and 3 are either both present or both absent. [2] is
7204 * any inversion list generated at compile time; [3] indicates if
7a6c6baa 7205 * that inversion list has any user-defined properties in it. */
88675427
KW
7206 if (av_len(av) >= 2) {
7207 invlist = ary[2];
7208 if (SvUV(ary[3])) {
83199d38
KW
7209 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7210 }
7a6c6baa
KW
7211 }
7212 else {
7213 invlist = NULL;
7a6c6baa
KW
7214 }
7215
8c9eb58f
KW
7216 /* Element [1] is reserved for the set-up swash. If already there,
7217 * return it; if not, create it and store it there */
f192cf32
KW
7218 if (SvROK(ary[1])) {
7219 sw = ary[1];
7220 }
ffc61ed2 7221 else if (si && doinit) {
7a6c6baa
KW
7222
7223 sw = _core_swash_init("utf8", /* the utf8 package */
7224 "", /* nameless */
7225 si,
7226 1, /* binary */
7227 0, /* not from tr/// */
7a6c6baa 7228 invlist,
83199d38 7229 &swash_init_flags);
ffc61ed2
JH
7230 (void)av_store(av, 1, sw);
7231 }
7232 }
7233 }
7234
7a6c6baa
KW
7235 if (listsvp) {
7236 SV* matches_string = newSVpvn("", 0);
7a6c6baa
KW
7237
7238 /* Use the swash, if any, which has to have incorporated into it all
7239 * possibilities */
872dd7e0
KW
7240 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7241 && (si && si != &PL_sv_undef))
7242 {
7a6c6baa 7243
872dd7e0 7244 /* If no swash, use the input initialization string, if available */
7a6c6baa
KW
7245 sv_catsv(matches_string, si);
7246 }
7247
7248 /* Add the inversion list to whatever we have. This may have come from
7249 * the swash, or from an input parameter */
7250 if (invlist) {
7251 sv_catsv(matches_string, _invlist_contents(invlist));
7252 }
7253 *listsvp = matches_string;
7254 }
7255
ffc61ed2
JH
7256 return sw;
7257}
7258
7259/*
ba7b4546 7260 - reginclass - determine if a character falls into a character class
832705d4 7261
6698fab5
KW
7262 n is the ANYOF regnode
7263 p is the target string
6698fab5 7264 utf8_target tells whether p is in UTF-8.
832705d4 7265
635cd5d4 7266 Returns true if matched; false otherwise.
eba1359e 7267
d5788240
KW
7268 Note that this can be a synthetic start class, a combination of various
7269 nodes, so things you think might be mutually exclusive, such as locale,
7270 aren't. It can match both locale and non-locale
7271
bbce6d69 7272 */
7273
76e3520e 7274STATIC bool
272d35c9 7275S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
bbce6d69 7276{
27da23d5 7277 dVAR;
a3b680e6 7278 const char flags = ANYOF_FLAGS(n);
bbce6d69 7279 bool match = FALSE;
cc07378b 7280 UV c = *p;
1aa99e6b 7281
7918f24d
NC
7282 PERL_ARGS_ASSERT_REGINCLASS;
7283
afd2eb18
KW
7284 /* If c is not already the code point, get it. Note that
7285 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7286 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 7287 STRLEN c_len = 0;
f7ab54c6 7288 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6182169b
KW
7289 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7290 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7291 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7292 * UTF8_ALLOW_FFFF */
f7ab54c6 7293 if (c_len == (STRLEN)-1)
e8a70c6f 7294 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 7295 }
4b3cda86 7296
7cdde544
KW
7297 /* If this character is potentially in the bitmap, check it */
7298 if (c < 256) {
ffc61ed2
JH
7299 if (ANYOF_BITMAP_TEST(n, c))
7300 match = TRUE;
11454c59
KW
7301 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7302 && ! utf8_target
7303 && ! isASCII(c))
7304 {
7305 match = TRUE;
7306 }
78969a98 7307 else if (flags & ANYOF_LOCALE) {
272d35c9 7308 RXp_MATCH_TAINTED_on(prog);
78969a98 7309
538b546e 7310 if ((flags & ANYOF_LOC_FOLD)
78969a98
KW
7311 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7312 {
ffc61ed2 7313 match = TRUE;
78969a98 7314 }
31c7f561
KW
7315 else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7316
7317 /* The data structure is arranged so bits 0, 2, 4, ... are set
7318 * if the class includes the Posix character class given by
7319 * bit/2; and 1, 3, 5, ... are set if the class includes the
7320 * complemented Posix class given by int(bit/2). So we loop
7321 * through the bits, each time changing whether we complement
7322 * the result or not. Suppose for the sake of illustration
7323 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7324 * is set, it means there is a match for this ANYOF node if the
7325 * character is in the class given by the expression (0 / 2 = 0
7326 * = \w). If it is in that class, isFOO_lc() will return 1,
7327 * and since 'to_complement' is 0, the result will stay TRUE,
7328 * and we exit the loop. Suppose instead that bit 0 is 0, but
7329 * bit 1 is 1. That means there is a match if the character
7330 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7331 * but will on bit 1. On the second iteration 'to_complement'
7332 * will be 1, so the exclusive or will reverse things, so we
7333 * are testing for \W. On the third iteration, 'to_complement'
7334 * will be 0, and we would be testing for \s; the fourth
b0d691b2
KW
7335 * iteration would test for \S, etc.
7336 *
7337 * Note that this code assumes that all the classes are closed
7338 * under folding. For example, if a character matches \w, then
7339 * its fold does too; and vice versa. This should be true for
7340 * any well-behaved locale for all the currently defined Posix
7341 * classes, except for :lower: and :upper:, which are handled
7342 * by the pseudo-class :cased: which matches if either of the
7343 * other two does. To get rid of this assumption, an outer
7344 * loop could be used below to iterate over both the source
7345 * character, and its fold (if different) */
31c7f561
KW
7346
7347 int count = 0;
7348 int to_complement = 0;
7349 while (count < ANYOF_MAX) {
7350 if (ANYOF_CLASS_TEST(n, count)
7351 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7352 {
7353 match = TRUE;
7354 break;
7355 }
7356 count++;
7357 to_complement ^= 1;
7358 }
ffc61ed2 7359 }
a0ed51b3 7360 }
a0ed51b3
LW
7361 }
7362
7cdde544 7363 /* If the bitmap didn't (or couldn't) match, and something outside the
1f327b5e 7364 * bitmap could match, try that. Locale nodes specify completely the
de87c4fe 7365 * behavior of code points in the bit map (otherwise, a utf8 target would
c613755a 7366 * cause them to be treated as Unicode and not locale), except in
de87c4fe 7367 * the very unlikely event when this node is a synthetic start class, which
c613755a
KW
7368 * could be a combination of locale and non-locale nodes. So allow locale
7369 * to match for the synthetic start class, which will give a false
7370 * positive that will be resolved when the match is done again as not part
7371 * of the synthetic start class */
ef87b810 7372 if (!match) {
10ee90d2
KW
7373 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7374 match = TRUE; /* Everything above 255 matches */
e051a21d 7375 }
6f8d7d0d
KW
7376 else if (ANYOF_NONBITMAP(n)
7377 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7378 || (utf8_target
7379 && (c >=256
7380 || (! (flags & ANYOF_LOCALE))
9aa1e39f 7381 || OP(n) == ANYOF_SYNTHETIC))))
ef87b810 7382 {
e0193e47 7383 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7cdde544
KW
7384 if (sw) {
7385 U8 * utf8_p;
7386 if (utf8_target) {
7387 utf8_p = (U8 *) p;
e0193e47
KW
7388 } else { /* Convert to utf8 */
7389 STRLEN len = 1;
7cdde544
KW
7390 utf8_p = bytes_to_utf8(p, &len);
7391 }
f56b6394 7392
e0193e47 7393 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 7394 match = TRUE;
e0193e47 7395 }
7cdde544
KW
7396
7397 /* If we allocated a string above, free it */
7398 if (! utf8_target) Safefree(utf8_p);
7399 }
7400 }
5073ffbd
KW
7401
7402 if (UNICODE_IS_SUPER(c)
954a2af6 7403 && OP(n) == ANYOF_WARN_SUPER
5073ffbd
KW
7404 && ckWARN_d(WARN_NON_UNICODE))
7405 {
7406 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7407 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7408 }
7cdde544
KW
7409 }
7410
f0fdc1c9
KW
7411 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7412 return cBOOL(flags & ANYOF_INVERT) ^ match;
a0ed51b3 7413}
161b471a 7414
dfe13c55 7415STATIC U8 *
0ce71af7 7416S_reghop3(U8 *s, I32 off, const U8* lim)
9041c2e3 7417{
6af86488
KW
7418 /* return the position 'off' UTF-8 characters away from 's', forward if
7419 * 'off' >= 0, backwards if negative. But don't go outside of position
7420 * 'lim', which better be < s if off < 0 */
7421
97aff369 7422 dVAR;
7918f24d
NC
7423
7424 PERL_ARGS_ASSERT_REGHOP3;
7425
a0ed51b3 7426 if (off >= 0) {
1aa99e6b 7427 while (off-- && s < lim) {
ffc61ed2 7428 /* XXX could check well-formedness here */
a0ed51b3 7429 s += UTF8SKIP(s);
ffc61ed2 7430 }
a0ed51b3
LW
7431 }
7432 else {
1de06328
YO
7433 while (off++ && s > lim) {
7434 s--;
7435 if (UTF8_IS_CONTINUED(*s)) {
7436 while (s > lim && UTF8_IS_CONTINUATION(*s))
7437 s--;
a0ed51b3 7438 }
1de06328 7439 /* XXX could check well-formedness here */
a0ed51b3
LW
7440 }
7441 }
7442 return s;
7443}
161b471a 7444
f9f4320a
YO
7445#ifdef XXX_dmq
7446/* there are a bunch of places where we use two reghop3's that should
7447 be replaced with this routine. but since thats not done yet
7448 we ifdef it out - dmq
7449*/
dfe13c55 7450STATIC U8 *
1de06328
YO
7451S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7452{
7453 dVAR;
7918f24d
NC
7454
7455 PERL_ARGS_ASSERT_REGHOP4;
7456
1de06328
YO
7457 if (off >= 0) {
7458 while (off-- && s < rlim) {
7459 /* XXX could check well-formedness here */
7460 s += UTF8SKIP(s);
7461 }
7462 }
7463 else {
7464 while (off++ && s > llim) {
7465 s--;
7466 if (UTF8_IS_CONTINUED(*s)) {
7467 while (s > llim && UTF8_IS_CONTINUATION(*s))
7468 s--;
7469 }
7470 /* XXX could check well-formedness here */
7471 }
7472 }
7473 return s;
7474}
f9f4320a 7475#endif
1de06328
YO
7476
7477STATIC U8 *
0ce71af7 7478S_reghopmaybe3(U8* s, I32 off, const U8* lim)
a0ed51b3 7479{
97aff369 7480 dVAR;
7918f24d
NC
7481
7482 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7483
a0ed51b3 7484 if (off >= 0) {
1aa99e6b 7485 while (off-- && s < lim) {
ffc61ed2 7486 /* XXX could check well-formedness here */
a0ed51b3 7487 s += UTF8SKIP(s);
ffc61ed2 7488 }
a0ed51b3 7489 if (off >= 0)
3dab1dad 7490 return NULL;
a0ed51b3
LW
7491 }
7492 else {
1de06328
YO
7493 while (off++ && s > lim) {
7494 s--;
7495 if (UTF8_IS_CONTINUED(*s)) {
7496 while (s > lim && UTF8_IS_CONTINUATION(*s))
7497 s--;
a0ed51b3 7498 }
1de06328 7499 /* XXX could check well-formedness here */
a0ed51b3
LW
7500 }
7501 if (off <= 0)
3dab1dad 7502 return NULL;
a0ed51b3
LW
7503 }
7504 return s;
7505}
51371543 7506
51371543 7507static void
acfe0abc 7508restore_pos(pTHX_ void *arg)
51371543 7509{
97aff369 7510 dVAR;
097eb12c 7511 regexp * const rex = (regexp *)arg;
ed301438 7512 if (PL_reg_state.re_state_eval_setup_done) {
51371543 7513 if (PL_reg_oldsaved) {
4f639d21
DM
7514 rex->subbeg = PL_reg_oldsaved;
7515 rex->sublen = PL_reg_oldsavedlen;
6502e081
DM
7516 rex->suboffset = PL_reg_oldsavedoffset;
7517 rex->subcoffset = PL_reg_oldsavedcoffset;
db2c6cb3 7518#ifdef PERL_ANY_COW
4f639d21 7519 rex->saved_copy = PL_nrs;
ed252734 7520#endif
07bc277f 7521 RXp_MATCH_COPIED_on(rex);
51371543
GS
7522 }
7523 PL_reg_magic->mg_len = PL_reg_oldpos;
ed301438 7524 PL_reg_state.re_state_eval_setup_done = FALSE;
51371543
GS
7525 PL_curpm = PL_reg_oldcurpm;
7526 }
7527}
33b8afdf
JH
7528
7529STATIC void
5aaab254 7530S_to_utf8_substr(pTHX_ regexp *prog)
33b8afdf 7531{
7e0d5ad7
KW
7532 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7533 * on the converted value */
7534
a1cac82e 7535 int i = 1;
7918f24d
NC
7536
7537 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7538
a1cac82e
NC
7539 do {
7540 if (prog->substrs->data[i].substr
7541 && !prog->substrs->data[i].utf8_substr) {
7542 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7543 prog->substrs->data[i].utf8_substr = sv;
7544 sv_utf8_upgrade(sv);
610460f9 7545 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 7546 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
7547 /* Trim the trailing \n that fbm_compile added last
7548 time. */
7549 SvCUR_set(sv, SvCUR(sv) - 1);
7550 /* Whilst this makes the SV technically "invalid" (as its
7551 buffer is no longer followed by "\0") when fbm_compile()
7552 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
7553 fbm_compile(sv, FBMcf_TAIL);
7554 } else
7555 fbm_compile(sv, 0);
610460f9 7556 }
a1cac82e
NC
7557 if (prog->substrs->data[i].substr == prog->check_substr)
7558 prog->check_utf8 = sv;
7559 }
7560 } while (i--);
33b8afdf
JH
7561}
7562
7e0d5ad7 7563STATIC bool
5aaab254 7564S_to_byte_substr(pTHX_ regexp *prog)
33b8afdf 7565{
7e0d5ad7
KW
7566 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7567 * on the converted value; returns FALSE if can't be converted. */
7568
97aff369 7569 dVAR;
a1cac82e 7570 int i = 1;
7918f24d
NC
7571
7572 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7573
a1cac82e
NC
7574 do {
7575 if (prog->substrs->data[i].utf8_substr
7576 && !prog->substrs->data[i].substr) {
7577 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
7578 if (! sv_utf8_downgrade(sv, TRUE)) {
7579 return FALSE;
7580 }
5400f398
KW
7581 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7582 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7583 /* Trim the trailing \n that fbm_compile added last
7584 time. */
7585 SvCUR_set(sv, SvCUR(sv) - 1);
7586 fbm_compile(sv, FBMcf_TAIL);
7587 } else
7588 fbm_compile(sv, 0);
7589 }
a1cac82e
NC
7590 prog->substrs->data[i].substr = sv;
7591 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7592 prog->check_substr = sv;
33b8afdf 7593 }
a1cac82e 7594 } while (i--);
7e0d5ad7
KW
7595
7596 return TRUE;
33b8afdf 7597}
66610fdd
RGS
7598
7599/*
7600 * Local variables:
7601 * c-indentation-style: bsd
7602 * c-basic-offset: 4
14d04a33 7603 * indent-tabs-mode: nil
66610fdd
RGS
7604 * End:
7605 *
14d04a33 7606 * ex: set ts=8 sts=4 sw=4 et:
37442d52 7607 */