This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade Module-Metadata from version 1.000022 to 1.000023
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
e1cf74e3
CB
86#ifdef DEBUGGING
87/* At least one required character in the target string is expressible only in
88 * UTF-8. */
89static const char* const non_utf8_target_but_utf8_required
90 = "Can't match, because target string needs to be in UTF-8\n";
91#endif
92
93#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
95 goto target; \
96} STMT_END
97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
2d66f61e 104/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
3db24e1e 107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 115#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad 117#define HOPc(pos,off) \
ba44c216 118 (char *)(reginfo->is_utf8_target \
220db18a 119 ? reghop3((U8*)pos, off, \
9d9163fb 120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 121 : (U8*)(pos + off))
557f47af 122
3dab1dad 123#define HOPBACKc(pos, off) \
ba44c216 124 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
125 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
126 : (pos - off >= reginfo->strbeg) \
8e11feef 127 ? (U8*)pos - off \
3dab1dad 128 : NULL)
efb30f32 129
ba44c216 130#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 131#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 132
557f47af
DM
133/* lim must be +ve. Returns NULL on overshoot */
134#define HOPMAYBE3(pos,off,lim) \
135 (reginfo->is_utf8_target \
136 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
137 : ((U8*)pos + off <= lim) \
138 ? (U8*)pos + off \
139 : NULL)
140
8e9f2289
DM
141/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
142 * off must be >=0; args should be vars rather than expressions */
143#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
144 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
145 : (U8*)((pos + off) > lim ? lim : (pos + off)))
146
2974eaec
DM
147#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
148 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
149 : (U8*)(pos + off))
150#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
151
152#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
153#define NEXTCHR_IS_EOS (nextchr < 0)
154
155#define SET_nextchr \
220db18a 156 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
157
158#define SET_locinput(p) \
159 locinput = (p); \
160 SET_nextchr
161
162
2a16ac92 163#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
164 if (!swash_ptr) { \
165 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 166 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 167 1, 0, invlist, &flags); \
c7304fe2
KW
168 assert(swash_ptr); \
169 } \
170 } STMT_END
171
172/* If in debug mode, we test that a known character properly matches */
173#ifdef DEBUGGING
174# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
175 property_name, \
2a16ac92 176 invlist, \
c7304fe2 177 utf8_char_in_property) \
2a16ac92 178 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
179 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
180#else
181# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
182 property_name, \
2a16ac92 183 invlist, \
c7304fe2 184 utf8_char_in_property) \
2a16ac92 185 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 186#endif
d1eb3177 187
c7304fe2
KW
188#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
189 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
190 "", \
191 PL_XPosix_ptrs[_CC_WORDCHAR], \
df38da56 192 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
c7304fe2
KW
193
194#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
195 STMT_START { \
196 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
197 "_X_regular_begin", \
2a16ac92 198 NULL, \
df38da56 199 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
c7304fe2
KW
200 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
201 "_X_extend", \
2a16ac92 202 NULL, \
c7304fe2
KW
203 COMBINING_GRAVE_ACCENT_UTF8); \
204 } STMT_END
d1eb3177 205
c7304fe2 206#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
207/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
208
5f80c4cf 209/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
210/* it would be nice to rework regcomp.sym to generate this stuff. sigh
211 *
212 * NOTE that *nothing* that affects backtracking should be in here, specifically
213 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
214 * node that is in between two EXACT like nodes when ascertaining what the required
215 * "follow" character is. This should probably be moved to regex compile time
216 * although it may be done at run time beause of the REF possibility - more
217 * investigation required. -- demerphq
218*/
baa60164
KW
219#define JUMPABLE(rn) ( \
220 OP(rn) == OPEN || \
3e901dc0 221 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
baa60164
KW
222 OP(rn) == EVAL || \
223 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
224 OP(rn) == PLUS || OP(rn) == MINMOD || \
225 OP(rn) == KEEPS || \
226 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 227)
ee9b8eae 228#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 229
ee9b8eae
YO
230#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
231
232#if 0
233/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
234 we don't need this definition. */
235#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 236#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
237#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
238
239#else
240/* ... so we use this as its faster. */
241#define IS_TEXT(rn) ( OP(rn)==EXACT )
098b07d5 242#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
243#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
244#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
245
246#endif
e2d8ce26 247
a84d97b6
HS
248/*
249 Search for mandatory following text node; for lookahead, the text must
250 follow but for lookbehind (rn->flags != 0) we skip to the next step.
251*/
baa60164 252#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
253 while (JUMPABLE(rn)) { \
254 const OPCODE type = OP(rn); \
255 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 256 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 257 else if (type == PLUS) \
cca55fe3 258 rn = NEXTOPER(rn); \
3dab1dad 259 else if (type == IFMATCH) \
a84d97b6 260 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 261 else rn += NEXT_OFF(rn); \
3dab1dad 262 } \
5f80c4cf 263} STMT_END
74750237 264
22913b96
KW
265/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
266 * These are for the pre-composed Hangul syllables, which are all in a
267 * contiguous block and arranged there in such a way so as to facilitate
268 * alorithmic determination of their characteristics. As such, they don't need
269 * a swash, but can be determined by simple arithmetic. Almost all are
270 * GCB=LVT, but every 28th one is a GCB=LV */
271#define SBASE 0xAC00 /* Start of block */
272#define SCount 11172 /* Length of block */
273#define TCount 28
c476f425 274
006f26b2
DM
275#define SLAB_FIRST(s) (&(s)->states[0])
276#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
277
a75351a1 278static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 279static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 280static regmatch_state * S_push_slab(pTHX);
51371543 281
87c0511b 282#define REGCP_PAREN_ELEMS 3
f067efbf 283#define REGCP_OTHER_ELEMS 3
e0fa7e2b 284#define REGCP_FRAME_ELEMS 1
620d5b66
NC
285/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
286 * are needed for the regexp context stack bookkeeping. */
287
76e3520e 288STATIC CHECKPOINT
92da3157 289S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 290{
97aff369 291 dVAR;
a3b680e6 292 const int retval = PL_savestack_ix;
92da3157
DM
293 const int paren_elems_to_push =
294 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
295 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
296 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 297 I32 p;
40a82448 298 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 299
b93070ed
DM
300 PERL_ARGS_ASSERT_REGCPPUSH;
301
e49a9654 302 if (paren_elems_to_push < 0)
d1b2014a
YO
303 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i",
304 paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS);
e49a9654 305
e0fa7e2b
NC
306 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
307 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 308 " out of range (%lu-%ld)",
92da3157
DM
309 total_elems,
310 (unsigned long)maxopenparen,
311 (long)parenfloor);
e0fa7e2b 312
620d5b66 313 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 314
495f47a5 315 DEBUG_BUFFERS_r(
92da3157 316 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
317 PerlIO_printf(Perl_debug_log,
318 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
319 PTR2UV(rex),
320 PTR2UV(rex->offs)
321 );
322 );
92da3157 323 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 324/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
325 SSPUSHIV(rex->offs[p].end);
326 SSPUSHIV(rex->offs[p].start);
1ca2007e 327 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 328 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
329 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
330 (UV)p,
331 (IV)rex->offs[p].start,
332 (IV)rex->offs[p].start_tmp,
333 (IV)rex->offs[p].end
40a82448 334 ));
a0d0e21e 335 }
b1ce53c5 336/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 337 SSPUSHINT(maxopenparen);
b93070ed
DM
338 SSPUSHINT(rex->lastparen);
339 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 340 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 341
a0d0e21e
LW
342 return retval;
343}
344
c277df42 345/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
346#define REGCP_SET(cp) \
347 DEBUG_STATE_r( \
ab3bbdeb 348 PerlIO_printf(Perl_debug_log, \
e4f74956 349 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
350 (IV)PL_savestack_ix)); \
351 cp = PL_savestack_ix
c3464db5 352
ab3bbdeb 353#define REGCP_UNWIND(cp) \
e4f74956 354 DEBUG_STATE_r( \
ab3bbdeb 355 if (cp != PL_savestack_ix) \
e4f74956
YO
356 PerlIO_printf(Perl_debug_log, \
357 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
358 (IV)(cp), (IV)PL_savestack_ix)); \
359 regcpblow(cp)
c277df42 360
a8d1f4b4
DM
361#define UNWIND_PAREN(lp, lcp) \
362 for (n = rex->lastparen; n > lp; n--) \
363 rex->offs[n].end = -1; \
364 rex->lastparen = n; \
365 rex->lastcloseparen = lcp;
366
367
f067efbf 368STATIC void
92da3157 369S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 370{
97aff369 371 dVAR;
e0fa7e2b 372 UV i;
87c0511b 373 U32 paren;
a3621e74
YO
374 GET_RE_DEBUG_FLAGS_DECL;
375
7918f24d
NC
376 PERL_ARGS_ASSERT_REGCPPOP;
377
b1ce53c5 378 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 379 i = SSPOPUV;
e0fa7e2b
NC
380 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
381 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
382 rex->lastcloseparen = SSPOPINT;
383 rex->lastparen = SSPOPINT;
92da3157 384 *maxopenparen_p = SSPOPINT;
b1ce53c5 385
620d5b66 386 i -= REGCP_OTHER_ELEMS;
b1ce53c5 387 /* Now restore the parentheses context. */
495f47a5
DM
388 DEBUG_BUFFERS_r(
389 if (i || rex->lastparen + 1 <= rex->nparens)
390 PerlIO_printf(Perl_debug_log,
391 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
392 PTR2UV(rex),
393 PTR2UV(rex->offs)
394 );
395 );
92da3157 396 paren = *maxopenparen_p;
620d5b66 397 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 398 SSize_t tmps;
1ca2007e 399 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
400 rex->offs[paren].start = SSPOPIV;
401 tmps = SSPOPIV;
b93070ed
DM
402 if (paren <= rex->lastparen)
403 rex->offs[paren].end = tmps;
495f47a5
DM
404 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
405 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
406 (UV)paren,
407 (IV)rex->offs[paren].start,
408 (IV)rex->offs[paren].start_tmp,
409 (IV)rex->offs[paren].end,
410 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 411 );
87c0511b 412 paren--;
a0d0e21e 413 }
daf18116 414#if 1
dafc8851
JH
415 /* It would seem that the similar code in regtry()
416 * already takes care of this, and in fact it is in
417 * a better location to since this code can #if 0-ed out
418 * but the code in regtry() is needed or otherwise tests
419 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
420 * (as of patchlevel 7877) will fail. Then again,
421 * this code seems to be necessary or otherwise
225593e1
DM
422 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
423 * --jhi updated by dapm */
b93070ed 424 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 425 if (i > *maxopenparen_p)
b93070ed
DM
426 rex->offs[i].start = -1;
427 rex->offs[i].end = -1;
495f47a5
DM
428 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
429 " \\%"UVuf": %s ..-1 undeffing\n",
430 (UV)i,
92da3157 431 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 432 ));
a0d0e21e 433 }
dafc8851 434#endif
a0d0e21e
LW
435}
436
74088413
DM
437/* restore the parens and associated vars at savestack position ix,
438 * but without popping the stack */
439
440STATIC void
92da3157 441S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
442{
443 I32 tmpix = PL_savestack_ix;
444 PL_savestack_ix = ix;
92da3157 445 regcppop(rex, maxopenparen_p);
74088413
DM
446 PL_savestack_ix = tmpix;
447}
448
02db2b7b 449#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 450
31c7f561
KW
451STATIC bool
452S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
453{
454 /* Returns a boolean as to whether or not 'character' is a member of the
455 * Posix character class given by 'classnum' that should be equivalent to a
456 * value in the typedef '_char_class_number'.
457 *
458 * Ideally this could be replaced by a just an array of function pointers
459 * to the C library functions that implement the macros this calls.
460 * However, to compile, the precise function signatures are required, and
461 * these may vary from platform to to platform. To avoid having to figure
462 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
463 * which adds an extra layer of function call overhead (unless the C
464 * optimizer strips it away). But we don't particularly care about
465 * performance with locales anyway. */
31c7f561
KW
466
467 switch ((_char_class_number) classnum) {
15861f94 468 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 469 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
470 case _CC_ENUM_ASCII: return isASCII_LC(character);
471 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
472 case _CC_ENUM_CASED: return isLOWER_LC(character)
473 || isUPPER_LC(character);
e8d596e0 474 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
475 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
476 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
477 case _CC_ENUM_LOWER: return isLOWER_LC(character);
478 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 479 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 480 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 481 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
482 case _CC_ENUM_UPPER: return isUPPER_LC(character);
483 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 484 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
485 default: /* VERTSPACE should never occur in locales */
486 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
487 }
488
489 assert(0); /* NOTREACHED */
490 return FALSE;
491}
492
3018b823
KW
493STATIC bool
494S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
495{
496 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
497 * 'character' is a member of the Posix character class given by 'classnum'
498 * that should be equivalent to a value in the typedef
499 * '_char_class_number'.
500 *
501 * This just calls isFOO_lc on the code point for the character if it is in
502 * the range 0-255. Outside that range, all characters avoid Unicode
503 * rules, ignoring any locale. So use the Unicode function if this class
504 * requires a swash, and use the Unicode macro otherwise. */
505
506 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
507
508 if (UTF8_IS_INVARIANT(*character)) {
509 return isFOO_lc(classnum, *character);
510 }
511 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
512 return isFOO_lc(classnum,
94bb8c36 513 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
514 }
515
516 if (classnum < _FIRST_NON_SWASH_CC) {
517
518 /* Initialize the swash unless done already */
519 if (! PL_utf8_swash_ptrs[classnum]) {
520 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
521 PL_utf8_swash_ptrs[classnum] =
522 _core_swash_init("utf8",
523 "",
524 &PL_sv_undef, 1, 0,
525 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
526 }
527
92a2046b
KW
528 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
529 character,
530 TRUE /* is UTF */ ));
3018b823
KW
531 }
532
533 switch ((_char_class_number) classnum) {
534 case _CC_ENUM_SPACE:
535 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
536
537 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
538 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
539 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
540 default: return 0; /* Things like CNTRL are always
541 below 256 */
542 }
543
544 assert(0); /* NOTREACHED */
545 return FALSE;
546}
547
a687059c 548/*
e50aee73 549 * pregexec and friends
a687059c
LW
550 */
551
76234dfb 552#ifndef PERL_IN_XSUB_RE
a687059c 553/*
c277df42 554 - pregexec - match a regexp against a string
a687059c 555 */
c277df42 556I32
5aaab254 557Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 558 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
559/* stringarg: the point in the string at which to begin matching */
560/* strend: pointer to null at end of string */
561/* strbeg: real beginning of string */
562/* minend: end of match must be >= minend bytes after stringarg. */
563/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
564 * itself is accessed via the pointers above */
565/* nosave: For optimizations. */
c277df42 566{
7918f24d
NC
567 PERL_ARGS_ASSERT_PREGEXEC;
568
c277df42 569 return
9041c2e3 570 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
571 nosave ? 0 : REXEC_COPY_STR);
572}
76234dfb 573#endif
22e551b9 574
cad2e5aa 575
6eb5f6b9 576
1a4edc3c
DM
577/* re_intuit_start():
578 *
579 * Based on some optimiser hints, try to find the earliest position in the
580 * string where the regex could match.
581 *
582 * rx: the regex to match against
583 * sv: the SV being matched: only used for utf8 flag; the string
584 * itself is accessed via the pointers below. Note that on
585 * something like an overloaded SV, SvPOK(sv) may be false
586 * and the string pointers may point to something unrelated to
587 * the SV itself.
588 * strbeg: real beginning of string
589 * strpos: the point in the string at which to begin matching
590 * strend: pointer to the byte following the last char of the string
591 * flags currently unused; set to 0
592 * data: currently unused; set to NULL
593 *
594 * The basic idea of re_intuit_start() is to use some known information
595 * about the pattern, namely:
596 *
597 * a) the longest known anchored substring (i.e. one that's at a
598 * constant offset from the beginning of the pattern; but not
599 * necessarily at a fixed offset from the beginning of the
600 * string);
601 * b) the longest floating substring (i.e. one that's not at a constant
602 * offset from the beginning of the pattern);
603 * c) Whether the pattern is anchored to the string; either
604 * an absolute anchor: /^../, or anchored to \n: /^.../m,
605 * or anchored to pos(): /\G/;
606 * d) A start class: a real or synthetic character class which
607 * represents which characters are legal at the start of the pattern;
608 *
609 * to either quickly reject the match, or to find the earliest position
610 * within the string at which the pattern might match, thus avoiding
611 * running the full NFA engine at those earlier locations, only to
612 * eventually fail and retry further along.
613 *
614 * Returns NULL if the pattern can't match, or returns the address within
615 * the string which is the earliest place the match could occur.
616 *
617 * The longest of the anchored and floating substrings is called 'check'
618 * and is checked first. The other is called 'other' and is checked
619 * second. The 'other' substring may not be present. For example,
620 *
621 * /(abc|xyz)ABC\d{0,3}DEFG/
622 *
623 * will have
624 *
625 * check substr (float) = "DEFG", offset 6..9 chars
626 * other substr (anchored) = "ABC", offset 3..3 chars
627 * stclass = [ax]
628 *
629 * Be aware that during the course of this function, sometimes 'anchored'
630 * refers to a substring being anchored relative to the start of the
631 * pattern, and sometimes to the pattern itself being anchored relative to
632 * the string. For example:
633 *
634 * /\dabc/: "abc" is anchored to the pattern;
635 * /^\dabc/: "abc" is anchored to the pattern and the string;
636 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
637 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
638 * but the pattern is anchored to the string.
52a21eb3
DM
639 */
640
cad2e5aa 641char *
52a21eb3
DM
642Perl_re_intuit_start(pTHX_
643 REGEXP * const rx,
644 SV *sv,
645 const char * const strbeg,
646 char *strpos,
647 char *strend,
648 const U32 flags,
649 re_scream_pos_data *data)
cad2e5aa 650{
97aff369 651 dVAR;
8d919b0a 652 struct regexp *const prog = ReANY(rx);
6b071d16 653 SSize_t start_shift = prog->check_offset_min;
cad2e5aa 654 /* Should be nonnegative! */
ea3daa5d 655 SSize_t end_shift = 0;
0fc004dd
DM
656 /* current lowest pos in string where the regex can start matching */
657 char *rx_origin = strpos;
eb578fdb 658 SV *check;
f2ed9b32 659 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 660 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 661 bool ml_anch = 0;
8f4bf5fc 662 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 663 char *check_at = NULL; /* check substr found at this pos */
bbe252da 664 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 665 RXi_GET_DECL(prog,progi);
02d5137b
DM
666 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
667 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
668 GET_RE_DEBUG_FLAGS_DECL;
669
7918f24d 670 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
671 PERL_UNUSED_ARG(flags);
672 PERL_UNUSED_ARG(data);
7918f24d 673
1dc475d0
DM
674 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675 "Intuit: trying to determine minimum start position...\n"));
676
fb9bbddb
DM
677 /* for now, assume that all substr offsets are positive. If at some point
678 * in the future someone wants to do clever things with look-behind and
679 * -ve offsets, they'll need to fix up any code in this function
680 * which uses these offsets. See the thread beginning
681 * <20140113145929.GF27210@iabyn.com>
682 */
683 assert(prog->substrs->data[0].min_offset >= 0);
684 assert(prog->substrs->data[0].max_offset >= 0);
685 assert(prog->substrs->data[1].min_offset >= 0);
686 assert(prog->substrs->data[1].max_offset >= 0);
687 assert(prog->substrs->data[2].min_offset >= 0);
688 assert(prog->substrs->data[2].max_offset >= 0);
689
f7022b5a 690 /* for now, assume that if both present, that the floating substring
83f2232d 691 * doesn't start before the anchored substring.
f7022b5a
DM
692 * If you break this assumption (e.g. doing better optimisations
693 * with lookahead/behind), then you'll need to audit the code in this
694 * function carefully first
695 */
696 assert(
697 ! ( (prog->anchored_utf8 || prog->anchored_substr)
698 && (prog->float_utf8 || prog->float_substr))
699 || (prog->float_min_offset >= prog->anchored_offset));
700
1a4edc3c
DM
701 /* byte rather than char calculation for efficiency. It fails
702 * to quickly reject some cases that can't match, but will reject
703 * them later after doing full char arithmetic */
c344f387 704 if (prog->minlen > strend - strpos) {
a3621e74 705 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 706 " String too short...\n"));
cad2e5aa 707 goto fail;
2c2d71f5 708 }
d8da0584 709
6c3fea77 710 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 711 reginfo->info_aux = NULL;
9d9163fb 712 reginfo->strbeg = strbeg;
220db18a 713 reginfo->strend = strend;
aed7b151 714 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 715 reginfo->intuit = 1;
1cb48e53
DM
716 /* not actually used within intuit, but zero for safety anyway */
717 reginfo->poscache_maxiter = 0;
02d5137b 718
f2ed9b32 719 if (utf8_target) {
33b8afdf
JH
720 if (!prog->check_utf8 && prog->check_substr)
721 to_utf8_substr(prog);
722 check = prog->check_utf8;
723 } else {
7e0d5ad7
KW
724 if (!prog->check_substr && prog->check_utf8) {
725 if (! to_byte_substr(prog)) {
6b54ddc5 726 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
727 }
728 }
33b8afdf
JH
729 check = prog->check_substr;
730 }
274cd312 731
1dc475d0
DM
732 /* dump the various substring data */
733 DEBUG_OPTIMISE_MORE_r({
734 int i;
735 for (i=0; i<=2; i++) {
736 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
737 : prog->substrs->data[i].substr);
738 if (!sv)
739 continue;
740
741 PerlIO_printf(Perl_debug_log,
742 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
743 " useful=%"IVdf" utf8=%d [%s]\n",
744 i,
745 (IV)prog->substrs->data[i].min_offset,
746 (IV)prog->substrs->data[i].max_offset,
747 (IV)prog->substrs->data[i].end_shift,
748 BmUSEFUL(sv),
749 utf8_target ? 1 : 0,
750 SvPEEK(sv));
751 }
752 });
753
8e1490ee 754 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
755
756 /* ml_anch: check after \n?
757 *
758 * A note about IMPLICIT: on an un-anchored pattern beginning
759 * with /.*.../, these flags will have been added by the
760 * compiler:
761 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
762 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
763 */
7d2d37f5
DM
764 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
765 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 766
343c8a29 767 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
768 /* we are only allowed to match at BOS or \G */
769
57fcbfa7 770 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
771 *
772 * Note that we don't try to do a similar quick reject for
773 * \G, since generally the caller will have calculated strpos
774 * based on pos() and gofs, so the string is already correctly
775 * anchored by definition; and handling the exceptions would
776 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 777 */
7bb3b9eb 778 if ( strpos != strbeg
343c8a29 779 && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL)))
c889ccc8
DM
780 {
781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 782 " Not at start...\n"));
c889ccc8
DM
783 goto fail;
784 }
785
a5d12a4b
DM
786 /* in the presence of an anchor, the anchored (relative to the
787 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
788 * to strpos. So quickly reject if substr isn't found there.
789 * This works for \G too, because the caller will already have
790 * subtracted gofs from pos, and gofs is the offset from the
791 * \G to the start of the regex. For example, in /.abc\Gdef/,
792 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
793 * caller will have set strpos=pos()-4; we look for the substr
794 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 795
c889ccc8 796 if (prog->check_offset_min == prog->check_offset_max
d0d44648 797 && !(prog->intflags & PREGf_CANY_SEEN))
c889ccc8
DM
798 {
799 /* Substring at constant offset from beg-of-str... */
d307bf57 800 SSize_t slen = SvCUR(check);
343c8a29 801 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 802
1dc475d0
DM
803 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
804 " Looking for check substr at fixed offset %"IVdf"...\n",
805 (IV)prog->check_offset_min));
806
7742aa66
DM
807 if (SvTAIL(check)) {
808 /* In this case, the regex is anchored at the end too.
809 * Unless it's a multiline match, the lengths must match
810 * exactly, give or take a \n. NB: slen >= 1 since
811 * the last char of check is \n */
812 if (!multiline
813 && ( strend - s > slen
814 || strend - s < slen - 1
815 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8
DM
816 {
817 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 818 " String too long...\n"));
c889ccc8
DM
819 goto fail_finish;
820 }
821 /* Now should match s[0..slen-2] */
822 slen--;
c889ccc8 823 }
d307bf57
DM
824 if (slen && (*SvPVX_const(check) != *s
825 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
826 {
827 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 828 " String not equal...\n"));
d307bf57
DM
829 goto fail_finish;
830 }
c889ccc8
DM
831
832 check_at = s;
833 goto success_at_start;
cad2e5aa 834 }
cad2e5aa 835 }
cad2e5aa 836 }
0fc004dd 837
c0e0ec46 838 end_shift = prog->check_end_shift;
cad2e5aa 839
19188028 840#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 841 if (end_shift < 0)
1de06328 842 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 843 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
844#endif
845
2c2d71f5 846 restart:
1de06328 847
66b7ec5c
DM
848 /* This is the (re)entry point of the main loop in this function.
849 * The goal of this loop is to:
850 * 1) find the "check" substring in the region rx_origin..strend
851 * (adjusted by start_shift / end_shift). If not found, reject
852 * immediately.
853 * 2) If it exists, look for the "other" substr too if defined; for
854 * example, if the check substr maps to the anchored substr, then
855 * check the floating substr, and vice-versa. If not found, go
856 * back to (1) with rx_origin suitably incremented.
857 * 3) If we find an rx_origin position that doesn't contradict
858 * either of the substrings, then check the possible additional
859 * constraints on rx_origin of /^.../m or a known start class.
860 * If these fail, then depending on which constraints fail, jump
861 * back to here, or to various other re-entry points further along
862 * that skip some of the first steps.
863 * 4) If we pass all those tests, update the BmUSEFUL() count on the
864 * substring. If the start position was determined to be at the
865 * beginning of the string - so, not rejected, but not optimised,
866 * since we have to run regmatch from position 0 - decrement the
867 * BmUSEFUL() count. Otherwise increment it.
868 */
869
1a4edc3c
DM
870
871 /* first, look for the 'check' substring */
872
1de06328 873 {
c33e64f0
FC
874 U8* start_point;
875 U8* end_point;
c889ccc8 876
c889ccc8 877 DEBUG_OPTIMISE_MORE_r({
1dc475d0 878 PerlIO_printf(Perl_debug_log,
ae5d4331 879 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
1dc475d0 880 " Start shift: %"IVdf" End shift %"IVdf
4d281893 881 " Real end Shift: %"IVdf"\n",
197641b6 882 (IV)(rx_origin - strpos),
c889ccc8 883 (IV)prog->check_offset_min,
12fbc530
DM
884 (IV)start_shift,
885 (IV)end_shift,
c889ccc8
DM
886 (IV)prog->check_end_shift);
887 });
1de06328 888
0d331aaf 889 if (prog->intflags & PREGf_CANY_SEEN) {
0fc004dd 890 start_point= (U8*)(rx_origin + start_shift);
12fbc530 891 end_point= (U8*)(strend - end_shift);
557f47af
DM
892 if (start_point > end_point)
893 goto fail_finish;
1de06328 894 } else {
557f47af
DM
895 end_point = HOP3(strend, -end_shift, strbeg);
896 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
897 if (!start_point)
898 goto fail_finish;
1de06328 899 }
c889ccc8 900
557f47af 901
e0362b86
DM
902 /* If the regex is absolutely anchored to either the start of the
903 * string (BOL,SBOL) or to pos() (ANCH_GPOS), then
904 * check_offset_max represents an upper bound on the string where
905 * the substr could start. For the ANCH_GPOS case, we assume that
906 * the caller of intuit will have already set strpos to
907 * pos()-gofs, so in this case strpos + offset_max will still be
908 * an upper bound on the substr.
909 */
c19c836a
DM
910 if (!ml_anch
911 && prog->intflags & PREGf_ANCH
e0362b86 912 && prog->check_offset_max != SSize_t_MAX)
c19c836a 913 {
1a08ba3a 914 SSize_t len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
915 const char * const anchor =
916 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
917
918 /* do a bytes rather than chars comparison. It's conservative;
919 * so it skips doing the HOP if the result can't possibly end
920 * up earlier than the old value of end_point.
921 */
922 if ((char*)end_point - anchor > prog->check_offset_max) {
923 end_point = HOP3lim((U8*)anchor,
924 prog->check_offset_max,
925 end_point -len)
926 + len;
927 }
d6ef1678
DM
928 }
929
6bda09f9 930 DEBUG_OPTIMISE_MORE_r({
1dc475d0 931 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
1de06328 932 (int)(end_point - start_point),
fc8cd66c 933 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
934 start_point);
935 });
936
ae5d4331 937 check_at = fbm_instr( start_point, end_point,
7fba1cd6 938 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 939
8fd34720
DM
940 /* Update the count-of-usability, remove useless subpatterns,
941 unshift s. */
942
943 DEBUG_EXECUTE_r({
944 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
945 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
946 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
947 (check_at ? "Found" : "Did not find"),
948 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
949 ? "anchored" : "floating"),
950 quoted,
951 RE_SV_TAIL(check),
952 (check_at ? " at offset " : "...\n") );
953 });
2c2d71f5 954
8fd34720
DM
955 if (!check_at)
956 goto fail_finish;
957 /* Finish the diagnostic message */
958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
2c2d71f5 959
8fd34720
DM
960 /* set rx_origin to the minimum position where the regex could start
961 * matching, given the constraint of the just-matched check substring.
962 * But don't set it lower than previously.
963 */
fdc003fd 964
8fd34720
DM
965 if (check_at - rx_origin > prog->check_offset_max)
966 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
967 }
fdc003fd
DM
968
969
1a4edc3c 970 /* now look for the 'other' substring if defined */
2c2d71f5 971
6480a6c4
DM
972 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
973 : prog->substrs->data[other_ix].substr)
1de06328 974 {
30944b6d 975 /* Take into account the "other" substring. */
6c3343a6
DM
976 char *last, *last1;
977 char *s;
978 SV* must;
979 struct reg_substr_datum *other;
980
981 do_other_substr:
982 other = &prog->substrs->data[other_ix];
983
984 /* if "other" is anchored:
985 * we've previously found a floating substr starting at check_at.
986 * This means that the regex origin must lie somewhere
987 * between min (rx_origin): HOP3(check_at, -check_offset_max)
988 * and max: HOP3(check_at, -check_offset_min)
989 * (except that min will be >= strpos)
990 * So the fixed substr must lie somewhere between
991 * HOP3(min, anchored_offset)
992 * HOP3(max, anchored_offset) + SvCUR(substr)
993 */
994
995 /* if "other" is floating
996 * Calculate last1, the absolute latest point where the
997 * floating substr could start in the string, ignoring any
998 * constraints from the earlier fixed match. It is calculated
999 * as follows:
1000 *
1001 * strend - prog->minlen (in chars) is the absolute latest
1002 * position within the string where the origin of the regex
1003 * could appear. The latest start point for the floating
1004 * substr is float_min_offset(*) on from the start of the
1005 * regex. last1 simply combines thee two offsets.
1006 *
1007 * (*) You might think the latest start point should be
1008 * float_max_offset from the regex origin, and technically
1009 * you'd be correct. However, consider
1010 * /a\d{2,4}bcd\w/
1011 * Here, float min, max are 3,5 and minlen is 7.
1012 * This can match either
1013 * /a\d\dbcd\w/
1014 * /a\d\d\dbcd\w/
1015 * /a\d\d\d\dbcd\w/
1016 * In the first case, the regex matches minlen chars; in the
1017 * second, minlen+1, in the third, minlen+2.
1018 * In the first case, the floating offset is 3 (which equals
1019 * float_min), in the second, 4, and in the third, 5 (which
1020 * equals float_max). In all cases, the floating string bcd
1021 * can never start more than 4 chars from the end of the
1022 * string, which equals minlen - float_min. As the substring
1023 * starts to match more than float_min from the start of the
1024 * regex, it makes the regex match more than minlen chars,
1025 * and the two cancel each other out. So we can always use
1026 * float_min - minlen, rather than float_max - minlen for the
1027 * latest position in the string.
1028 *
1029 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1030 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1031 */
1032
e7a14a9c 1033 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1034 last1 = HOP3c(strend,
1035 other->min_offset - prog->minlen, strbeg);
1036
4d006249 1037 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1038 /* last is the latest point where the floating substr could
1039 * start, *given* any constraints from the earlier fixed
1040 * match. This constraint is that the floating string starts
1041 * <= float_max_offset chars from the regex origin (rx_origin).
1042 * If this value is less than last1, use it instead.
eb3831ce 1043 */
6c3343a6
DM
1044 assert(rx_origin <= last1);
1045 last =
1046 /* this condition handles the offset==infinity case, and
1047 * is a short-cut otherwise. Although it's comparing a
1048 * byte offset to a char length, it does so in a safe way,
1049 * since 1 char always occupies 1 or more bytes,
1050 * so if a string range is (last1 - rx_origin) bytes,
1051 * it will be less than or equal to (last1 - rx_origin)
1052 * chars; meaning it errs towards doing the accurate HOP3
1053 * rather than just using last1 as a short-cut */
1054 (last1 - rx_origin) < other->max_offset
1055 ? last1
1056 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1057 }
1058 else {
1059 assert(strpos + start_shift <= check_at);
1060 last = HOP4c(check_at, other->min_offset - start_shift,
1061 strbeg, strend);
1062 }
ead917d0 1063
6c3343a6
DM
1064 s = HOP3c(rx_origin, other->min_offset, strend);
1065 if (s < other_last) /* These positions already checked */
1066 s = other_last;
1067
1068 must = utf8_target ? other->utf8_substr : other->substr;
1069 assert(SvPOK(must));
1070 s = fbm_instr(
1071 (unsigned char*)s,
1072 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1073 must,
1074 multiline ? FBMrf_MULTILINE : 0
1075 );
1076 DEBUG_EXECUTE_r({
1077 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1078 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1079 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1080 s ? "Found" : "Contradicts",
1081 other_ix ? "floating" : "anchored",
1082 quoted, RE_SV_TAIL(must));
1083 });
ead917d0 1084
ead917d0 1085
6c3343a6
DM
1086 if (!s) {
1087 /* last1 is latest possible substr location. If we didn't
1088 * find it before there, we never will */
1089 if (last >= last1) {
1090 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1091 ", giving up...\n"));
1092 goto fail_finish;
ead917d0
DM
1093 }
1094
6c3343a6
DM
1095 /* try to find the check substr again at a later
1096 * position. Maybe next time we'll find the "other" substr
1097 * in range too */
1098 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1099 ", trying %s at offset %ld...\n",
1100 (other_ix ? "floating" : "anchored"),
197641b6 1101 (long)(HOP3c(check_at, 1, strend) - strpos)));
6c3343a6
DM
1102
1103 other_last = HOP3c(last, 1, strend) /* highest failure */;
1104 rx_origin =
4d006249 1105 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1106 ? HOP3c(rx_origin, 1, strend)
1107 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1108 goto restart;
1109 }
1110 else {
1111 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
197641b6 1112 (long)(s - strpos)));
ead917d0 1113
4d006249 1114 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1115 /* other_last is set to s, not s+1, since its possible for
1116 * a floating substr to fail first time, then succeed
1117 * second time at the same floating position; e.g.:
1118 * "-AB--AABZ" =~ /\wAB\d*Z/
1119 * The first time round, anchored and float match at
1120 * "-(AB)--AAB(Z)" then fail on the initial \w character
1121 * class. Second time round, they match at "-AB--A(AB)(Z)".
1122 */
1123 other_last = s;
ead917d0
DM
1124 }
1125 else {
6c3343a6
DM
1126 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1127 other_last = HOP3c(s, 1, strend);
ead917d0 1128 }
6c3343a6 1129 }
cad2e5aa 1130 }
acba93e8
DM
1131 else {
1132 DEBUG_OPTIMISE_MORE_r(
1133 PerlIO_printf(Perl_debug_log,
1134 " Check-only match: offset min:%"IVdf" max:%"IVdf
1c1c599d 1135 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
acba93e8
DM
1136 " strend-strpos:%"IVdf"\n",
1137 (IV)prog->check_offset_min,
1138 (IV)prog->check_offset_max,
1c1c599d 1139 (IV)(check_at-strpos),
acba93e8 1140 (IV)(rx_origin-strpos),
1c1c599d 1141 (IV)(rx_origin-check_at),
acba93e8
DM
1142 (IV)(strend-strpos)
1143 )
1144 );
1145 }
2c2d71f5 1146
acba93e8 1147 postprocess_substr_matches:
0991020e 1148
1a4edc3c 1149 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1150
7d2d37f5 1151 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1152 char *s;
1153
a62659bd
DM
1154 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1155 " looking for /^/m anchor"));
d0880ea7
DM
1156
1157 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1158 * Find the next \n, if any, even if it's beyond the current
1159 * anchored and/or floating substrings. Whether we should be
1160 * scanning ahead for the next \n or the next substr is debatable.
1161 * On the one hand you'd expect rare substrings to appear less
1162 * often than \n's. On the other hand, searching for \n means
1163 * we're effectively flipping been check_substr and "\n" on each
1164 * iteration as the current "rarest" string candidate, which
1165 * means for example that we'll quickly reject the whole string if
1166 * hasn't got a \n, rather than trying every substr position
1167 * first
1168 */
d0880ea7 1169
4620cb61
DM
1170 s = HOP3c(strend, - prog->minlen, strpos);
1171 if (s <= rx_origin ||
1172 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1173 {
d0880ea7
DM
1174 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1175 " Did not find /%s^%s/m...\n",
1176 PL_colors[0], PL_colors[1]));
a62659bd
DM
1177 goto fail_finish;
1178 }
d0880ea7 1179
4ada1233
DM
1180 /* earliest possible origin is 1 char after the \n.
1181 * (since *rx_origin == '\n', it's safe to ++ here rather than
1182 * HOP(rx_origin, 1)) */
1183 rx_origin++;
d0880ea7 1184
f4f115de 1185 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1186 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1187 {
d0880ea7
DM
1188 /* Position contradicts check-string; either because
1189 * check was anchored (and thus has no wiggle room),
4ada1233 1190 * or check was float and rx_origin is above the float range */
d0880ea7
DM
1191 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1192 " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
4ada1233 1193 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
d0880ea7
DM
1194 goto restart;
1195 }
1196
1197 /* if we get here, the check substr must have been float,
2e759faa 1198 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1199 * "other" substr which still contradicts */
1200 assert(prog->substrs->check_ix); /* check is float */
1201
1202 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1203 /* whoops, the anchored "other" substr exists, so we still
1204 * contradict. On the other hand, the float "check" substr
1205 * didn't contradict, so just retry the anchored "other"
1206 * substr */
1207 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1208 " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1209 PL_colors[0], PL_colors[1],
1210 (long)(rx_origin - strpos),
1211 (long)(rx_origin - strpos + prog->anchored_offset)));
1212 goto do_other_substr;
1213 }
1214
1215 /* success: we don't contradict the found floating substring
1216 * (and there's no anchored substr). */
d0880ea7
DM
1217 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1218 " Found /%s^%s/m at offset %ld...\n",
1219 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
e3c6feb0
DM
1220 }
1221 else {
2e759faa 1222 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
fe4f3442 1223 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1224 }
1225
ffad1e6a 1226 success_at_start:
e3c6feb0 1227
cad2e5aa 1228
dd170ff5
DM
1229 /* if we have a starting character class, then test that extra constraint.
1230 * (trie stclasses are too expensive to use here, we are better off to
1231 * leave it to regmatch itself) */
1232
f8fc2ecf 1233 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1234 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1235
2c75e362 1236 /* XXX this value could be pre-computed */
f8fc2ecf 1237 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1238 ? (reginfo->is_utf8_pat
1239 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1240 : STR_LEN(progi->regstclass))
66e933ab 1241 : 1);
1de06328 1242 char * endpos;
fa3bb21d 1243 char *s;
000dfd2d
DM
1244 /* latest pos that a matching float substr constrains rx start to */
1245 char *rx_max_float = NULL;
1246
c75a3985
DM
1247 /* if the current rx_origin is anchored, either by satisfying an
1248 * anchored substring constraint, or a /^.../m constraint, then we
1249 * can reject the current origin if the start class isn't found
1250 * at the current position. If we have a float-only match, then
1251 * rx_origin is constrained to a range; so look for the start class
1252 * in that range. if neither, then look for the start class in the
1253 * whole rest of the string */
1254
dd170ff5
DM
1255 /* XXX DAPM it's not clear what the minlen test is for, and why
1256 * it's not used in the floating case. Nothing in the test suite
1257 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1258 * Here are some old comments, which may or may not be correct:
1259 *
1260 * minlen == 0 is possible if regstclass is \b or \B,
1261 * and the fixed substr is ''$.
1262 * Since minlen is already taken into account, rx_origin+1 is
1263 * before strend; accidentally, minlen >= 1 guaranties no false
1264 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1265 * 0) below assumes that regstclass does not come from lookahead...
1266 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1267 * This leaves EXACTF-ish only, which are dealt with in
1268 * find_byclass().
1269 */
1270
7d2d37f5 1271 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
fa3bb21d 1272 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d
DM
1273 else if (prog->float_substr || prog->float_utf8) {
1274 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1275 endpos= HOP3c(rx_max_float, cl_l, strend);
1276 }
1de06328
YO
1277 else
1278 endpos= strend;
1279
1dc475d0
DM
1280 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1281 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
c43b5520 1282 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1dc475d0 1283 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1284 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1285
c43b5520 1286 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1287 reginfo);
be778b1a 1288 if (!s) {
6eb5f6b9 1289 if (endpos == strend) {
a3621e74 1290 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1291 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1292 goto fail;
1293 }
a3621e74 1294 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1295 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1296 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1297 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1298 goto fail;
9fed8d02 1299
6eb5f6b9 1300 /* Contradict one of substrings */
97136c8a
DM
1301 if (prog->anchored_substr || prog->anchored_utf8) {
1302 if (prog->substrs->check_ix == 1) { /* check is float */
1303 /* Have both, check_string is floating */
1304 assert(rx_origin + start_shift <= check_at);
1305 if (rx_origin + start_shift != check_at) {
1306 /* not at latest position float substr could match:
c75a3985
DM
1307 * Recheck anchored substring, but not floating.
1308 * The condition above is in bytes rather than
1309 * chars for efficiency. It's conservative, in
1310 * that it errs on the side of doing 'goto
1311 * do_other_substr', where a more accurate
1312 * char-based calculation will be done */
97136c8a
DM
1313 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1314 " Looking for anchored substr starting at offset %ld...\n",
1315 (long)(other_last - strpos)) );
1316 goto do_other_substr;
3369914b 1317 }
3369914b
DM
1318 }
1319 }
97136c8a 1320 else {
9fed8d02
DM
1321 /* float-only */
1322
7d2d37f5 1323 if (ml_anch) {
c75a3985
DM
1324 /* In the presence of ml_anch, we might be able to
1325 * find another \n without breaking the current float
1326 * constraint. */
1327
1328 /* strictly speaking this should be HOP3c(..., 1, ...),
1329 * but since we goto a block of code that's going to
1330 * search for the next \n if any, its safe here */
9fed8d02 1331 rx_origin++;
9fed8d02
DM
1332 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1333 " Looking for /%s^%s/m starting at offset %ld...\n",
1334 PL_colors[0], PL_colors[1],
1335 (long)(rx_origin - strpos)) );
9fed8d02 1336 goto postprocess_substr_matches;
ab60c45a 1337 }
c75a3985
DM
1338
1339 /* strictly speaking this can never be true; but might
1340 * be if we ever allow intuit without substrings */
1341 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1342 goto fail;
c75a3985 1343
000dfd2d 1344 rx_origin = rx_max_float;
9fed8d02
DM
1345 }
1346
c75a3985
DM
1347 /* at this point, any matching substrings have been
1348 * contradicted. Start again... */
1349
9fed8d02 1350 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1351
1352 /* uses bytes rather than char calculations for efficiency.
1353 * It's conservative: it errs on the side of doing 'goto restart',
1354 * where there is code that does a proper char-based test */
9fed8d02 1355 if (rx_origin + start_shift + end_shift > strend) {
40268e5b 1356 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
9fed8d02
DM
1357 " Could not match STCLASS...\n") );
1358 goto fail;
1359 }
1360 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1361 " Looking for %s substr starting at offset %ld...\n",
1362 (prog->substrs->check_ix ? "floating" : "anchored"),
1363 (long)(rx_origin + start_shift - strpos)) );
1364 goto restart;
6eb5f6b9 1365 }
9fed8d02 1366
c75a3985
DM
1367 /* Success !!! */
1368
5f9c6575 1369 if (rx_origin != s) {
a3621e74 1370 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1371 " By STCLASS: moving %ld --> %ld\n",
5f9c6575 1372 (long)(rx_origin - strpos), (long)(s - strpos))
b7953727
JH
1373 );
1374 }
1375 else {
a3621e74 1376 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1377 " Does not contradict STCLASS...\n");
b7953727
JH
1378 );
1379 }
6eb5f6b9 1380 }
ffad1e6a
DM
1381
1382 /* Decide whether using the substrings helped */
1383
1384 if (rx_origin != strpos) {
1385 /* Fixed substring is found far enough so that the match
1386 cannot start at strpos. */
1387
1388 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1389 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1390 }
1391 else {
70563e16
DM
1392 /* The found rx_origin position does not prohibit matching at
1393 * strpos, so calling intuit didn't gain us anything. Decrement
1394 * the BmUSEFUL() count on the check substring, and if we reach
1395 * zero, free it. */
1396 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1397 && (utf8_target ? (
1398 prog->check_utf8 /* Could be deleted already */
1399 && --BmUSEFUL(prog->check_utf8) < 0
1400 && (prog->check_utf8 == prog->float_utf8)
1401 ) : (
1402 prog->check_substr /* Could be deleted already */
1403 && --BmUSEFUL(prog->check_substr) < 0
1404 && (prog->check_substr == prog->float_substr)
1405 )))
1406 {
1407 /* If flags & SOMETHING - do not do it many times on the same match */
1408 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1409 /* XXX Does the destruction order has to change with utf8_target? */
1410 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1411 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1412 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1413 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1414 check = NULL; /* abort */
ffad1e6a
DM
1415 /* XXXX This is a remnant of the old implementation. It
1416 looks wasteful, since now INTUIT can use many
1417 other heuristics. */
1418 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1419 }
1420 }
1421
1422 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1423 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1424 PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1425
c765d6e0 1426 return rx_origin;
2c2d71f5
JH
1427
1428 fail_finish: /* Substring not found */
33b8afdf 1429 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1430 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1431 fail:
a3621e74 1432 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1433 PL_colors[4], PL_colors[5]));
bd61b366 1434 return NULL;
cad2e5aa 1435}
9661b544 1436
70563e16 1437
a0a388a1 1438#define DECL_TRIE_TYPE(scan) \
098b07d5
KW
1439 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1440 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
fab2782b
YO
1441 trie_type = ((scan->flags == EXACT) \
1442 ? (utf8_target ? trie_utf8 : trie_plain) \
098b07d5
KW
1443 : (scan->flags == EXACTFA) \
1444 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1445 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
fab2782b 1446
fd3249ee 1447#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1448STMT_START { \
fab2782b 1449 STRLEN skiplen; \
baa60164 1450 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1451 switch (trie_type) { \
31f05a37 1452 case trie_utf8_exactfa_fold: \
baa60164 1453 flags |= FOLD_FLAGS_NOMIX_ASCII; \
924ba076 1454 /* FALLTHROUGH */ \
fab2782b
YO
1455 case trie_utf8_fold: \
1456 if ( foldlen>0 ) { \
c80e42f3 1457 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1458 foldlen -= len; \
1459 uscan += len; \
1460 len=0; \
1461 } else { \
445bf929 1462 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1463 len = UTF8SKIP(uc); \
1464 skiplen = UNISKIP( uvc ); \
1465 foldlen -= skiplen; \
1466 uscan = foldbuf + skiplen; \
1467 } \
1468 break; \
baa60164
KW
1469 case trie_latin_utf8_exactfa_fold: \
1470 flags |= FOLD_FLAGS_NOMIX_ASCII; \
924ba076 1471 /* FALLTHROUGH */ \
fab2782b
YO
1472 case trie_latin_utf8_fold: \
1473 if ( foldlen>0 ) { \
c80e42f3 1474 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1475 foldlen -= len; \
1476 uscan += len; \
1477 len=0; \
1478 } else { \
1479 len = 1; \
31f05a37 1480 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1481 skiplen = UNISKIP( uvc ); \
1482 foldlen -= skiplen; \
1483 uscan = foldbuf + skiplen; \
1484 } \
1485 break; \
1486 case trie_utf8: \
c80e42f3 1487 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1488 break; \
1489 case trie_plain: \
1490 uvc = (UV)*uc; \
1491 len = 1; \
1492 } \
1493 if (uvc < 256) { \
1494 charid = trie->charmap[ uvc ]; \
1495 } \
1496 else { \
1497 charid = 0; \
1498 if (widecharmap) { \
1499 SV** const svpp = hv_fetch(widecharmap, \
1500 (char*)&uvc, sizeof(UV), 0); \
1501 if (svpp) \
1502 charid = (U16)SvIV(*svpp); \
1503 } \
1504 } \
4cadc6a9
YO
1505} STMT_END
1506
4cadc6a9
YO
1507#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1508STMT_START { \
1509 while (s <= e) { \
1510 if ( (CoNd) \
fac1af77 1511 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1512 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1513 goto got_it; \
1514 s++; \
1515 } \
1516} STMT_END
1517
1518#define REXEC_FBC_UTF8_SCAN(CoDe) \
1519STMT_START { \
9a902117 1520 while (s < strend) { \
4cadc6a9 1521 CoDe \
9a902117 1522 s += UTF8SKIP(s); \
4cadc6a9
YO
1523 } \
1524} STMT_END
1525
1526#define REXEC_FBC_SCAN(CoDe) \
1527STMT_START { \
1528 while (s < strend) { \
1529 CoDe \
1530 s++; \
1531 } \
1532} STMT_END
1533
1534#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1535REXEC_FBC_UTF8_SCAN( \
1536 if (CoNd) { \
02d5137b 1537 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1538 goto got_it; \
1539 else \
1540 tmp = doevery; \
1541 } \
1542 else \
1543 tmp = 1; \
1544)
1545
1546#define REXEC_FBC_CLASS_SCAN(CoNd) \
1547REXEC_FBC_SCAN( \
1548 if (CoNd) { \
02d5137b 1549 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1550 goto got_it; \
1551 else \
1552 tmp = doevery; \
1553 } \
1554 else \
1555 tmp = 1; \
1556)
1557
baa60164 1558#define REXEC_FBC_TRYIT \
02d5137b 1559if ((reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1560 goto got_it
1561
e1d1eefb 1562#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
baa60164 1563 if (utf8_target) { \
e1d1eefb
YO
1564 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1565 } \
1566 else { \
1567 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1568 }
e1d1eefb 1569
baa60164 1570#define DUMP_EXEC_POS(li,s,doutf8) \
9d9163fb 1571 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
6d59b646 1572 startpos, doutf8)
786e8c11 1573
cfaf538b 1574
baa60164 1575#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1576 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
cfaf538b
KW
1577 tmp = TEST_NON_UTF8(tmp); \
1578 REXEC_FBC_UTF8_SCAN( \
ce1d15d5 1579 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
cfaf538b
KW
1580 tmp = !tmp; \
1581 IF_SUCCESS; \
1582 } \
1583 else { \
1584 IF_FAIL; \
1585 } \
1586 ); \
1587
baa60164 1588#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1589 if (s == reginfo->strbeg) { \
cfaf538b
KW
1590 tmp = '\n'; \
1591 } \
1592 else { \
9d9163fb 1593 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
3db24e1e
KW
1594 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1595 0, UTF8_ALLOW_DEFAULT); \
cfaf538b
KW
1596 } \
1597 tmp = TeSt1_UtF8; \
3db24e1e 1598 LOAD_UTF8_CHARCLASS_ALNUM(); \
cfaf538b 1599 REXEC_FBC_UTF8_SCAN( \
3db24e1e 1600 if (tmp == ! (TeSt2_UtF8)) { \
cfaf538b
KW
1601 tmp = !tmp; \
1602 IF_SUCCESS; \
1603 } \
1604 else { \
1605 IF_FAIL; \
1606 } \
1607 ); \
1608
63ac0dad
KW
1609/* The only difference between the BOUND and NBOUND cases is that
1610 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1611 * NBOUND. This is accomplished by passing it in either the if or else clause,
1612 * with the other one being empty */
1613#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1614 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1615
1616#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1617 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1618
1619#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1620 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1621
1622#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1623 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1624
63ac0dad
KW
1625
1626/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1627 * be passed in completely with the variable name being tested, which isn't
1628 * such a clean interface, but this is easier to read than it was before. We
1629 * are looking for the boundary (or non-boundary between a word and non-word
1630 * character. The utf8 and non-utf8 cases have the same logic, but the details
1631 * must be different. Find the "wordness" of the character just prior to this
1632 * one, and compare it with the wordness of this one. If they differ, we have
1633 * a boundary. At the beginning of the string, pretend that the previous
1634 * character was a new-line */
baa60164 1635#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1636 if (utf8_target) { \
baa60164 1637 UTF8_CODE \
63ac0dad
KW
1638 } \
1639 else { /* Not utf8 */ \
9d9163fb 1640 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad
KW
1641 tmp = TEST_NON_UTF8(tmp); \
1642 REXEC_FBC_SCAN( \
1643 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1644 tmp = !tmp; \
1645 IF_SUCCESS; \
1646 } \
1647 else { \
1648 IF_FAIL; \
1649 } \
1650 ); \
1651 } \
baa60164 1652 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
63ac0dad
KW
1653 goto got_it;
1654
786e8c11 1655/* We know what class REx starts with. Try to find this position... */
02d5137b 1656/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1657/* annoyingly all the vars in this routine have different names from their counterparts
1658 in regmatch. /grrr */
1659
3c3eec57 1660STATIC char *
07be1b83 1661S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1662 const char *strend, regmatch_info *reginfo)
a687059c 1663{
73104a1b
KW
1664 dVAR;
1665 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1666 char *pat_string; /* The pattern's exactish string */
1667 char *pat_end; /* ptr to end char of pat_string */
1668 re_fold_t folder; /* Function for computing non-utf8 folds */
1669 const U8 *fold_array; /* array for folding ords < 256 */
1670 STRLEN ln;
1671 STRLEN lnc;
73104a1b
KW
1672 U8 c1;
1673 U8 c2;
1674 char *e;
1675 I32 tmp = 1; /* Scratch variable? */
ba44c216 1676 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1677 UV utf8_fold_flags = 0;
f9176b44 1678 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1679 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1680 with a result inverts that result, as 0^1 =
1681 1 and 1^1 = 0 */
1682 _char_class_number classnum;
1683
73104a1b 1684 RXi_GET_DECL(prog,progi);
2f7f8cb1 1685
73104a1b 1686 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1687
73104a1b
KW
1688 /* We know what class it must start with. */
1689 switch (OP(c)) {
1690 case ANYOF:
1691 if (utf8_target) {
1692 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1693 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1694 }
1695 else {
1696 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1697 }
1698 break;
1699 case CANY:
1700 REXEC_FBC_SCAN(
02d5137b 1701 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
73104a1b
KW
1702 goto got_it;
1703 else
1704 tmp = doevery;
1705 );
1706 break;
1707
098b07d5
KW
1708 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1709 assert(! is_utf8_pat);
924ba076 1710 /* FALLTHROUGH */
73104a1b 1711 case EXACTFA:
984e6dd1 1712 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1713 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1714 goto do_exactf_utf8;
1715 }
1716 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1717 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1718 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1719
2fdb7295
KW
1720 case EXACTF: /* This node only generated for non-utf8 patterns */
1721 assert(! is_utf8_pat);
73104a1b 1722 if (utf8_target) {
73104a1b
KW
1723 utf8_fold_flags = 0;
1724 goto do_exactf_utf8;
1725 }
1726 fold_array = PL_fold;
1727 folder = foldEQ;
1728 goto do_exactf_non_utf8;
1729
1730 case EXACTFL:
31f05a37 1731 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1732 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1733 goto do_exactf_utf8;
1734 }
1735 fold_array = PL_fold_locale;
1736 folder = foldEQ_locale;
1737 goto do_exactf_non_utf8;
3c760661 1738
73104a1b 1739 case EXACTFU_SS:
984e6dd1 1740 if (is_utf8_pat) {
73104a1b
KW
1741 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1742 }
1743 goto do_exactf_utf8;
16d951b7 1744
73104a1b 1745 case EXACTFU:
984e6dd1
DM
1746 if (is_utf8_pat || utf8_target) {
1747 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1748 goto do_exactf_utf8;
1749 }
fac1af77 1750
73104a1b
KW
1751 /* Any 'ss' in the pattern should have been replaced by regcomp,
1752 * so we don't have to worry here about this single special case
1753 * in the Latin1 range */
1754 fold_array = PL_fold_latin1;
1755 folder = foldEQ_latin1;
1756
924ba076 1757 /* FALLTHROUGH */
73104a1b
KW
1758
1759 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1760 are no glitches with fold-length differences
1761 between the target string and pattern */
1762
1763 /* The idea in the non-utf8 EXACTF* cases is to first find the
1764 * first character of the EXACTF* node and then, if necessary,
1765 * case-insensitively compare the full text of the node. c1 is the
1766 * first character. c2 is its fold. This logic will not work for
1767 * Unicode semantics and the german sharp ss, which hence should
1768 * not be compiled into a node that gets here. */
1769 pat_string = STRING(c);
1770 ln = STR_LEN(c); /* length to match in octets/bytes */
1771
1772 /* We know that we have to match at least 'ln' bytes (which is the
1773 * same as characters, since not utf8). If we have to match 3
1774 * characters, and there are only 2 availabe, we know without
1775 * trying that it will fail; so don't start a match past the
1776 * required minimum number from the far end */
ea3daa5d 1777 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1778
02d5137b 1779 if (reginfo->intuit && e < s) {
73104a1b
KW
1780 e = s; /* Due to minlen logic of intuit() */
1781 }
fac1af77 1782
73104a1b
KW
1783 c1 = *pat_string;
1784 c2 = fold_array[c1];
1785 if (c1 == c2) { /* If char and fold are the same */
1786 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1787 }
1788 else {
1789 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1790 }
1791 break;
fac1af77 1792
73104a1b
KW
1793 do_exactf_utf8:
1794 {
1795 unsigned expansion;
1796
1797 /* If one of the operands is in utf8, we can't use the simpler folding
1798 * above, due to the fact that many different characters can have the
1799 * same fold, or portion of a fold, or different- length fold */
1800 pat_string = STRING(c);
1801 ln = STR_LEN(c); /* length to match in octets/bytes */
1802 pat_end = pat_string + ln;
984e6dd1 1803 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1804 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1805 : ln;
1806
1807 /* We have 'lnc' characters to match in the pattern, but because of
1808 * multi-character folding, each character in the target can match
1809 * up to 3 characters (Unicode guarantees it will never exceed
1810 * this) if it is utf8-encoded; and up to 2 if not (based on the
1811 * fact that the Latin 1 folds are already determined, and the
1812 * only multi-char fold in that range is the sharp-s folding to
1813 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1814 * string character. Adjust lnc accordingly, rounding up, so that
1815 * if we need to match at least 4+1/3 chars, that really is 5. */
1816 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1817 lnc = (lnc + expansion - 1) / expansion;
1818
1819 /* As in the non-UTF8 case, if we have to match 3 characters, and
1820 * only 2 are left, it's guaranteed to fail, so don't start a
1821 * match that would require us to go beyond the end of the string
1822 */
ea3daa5d 1823 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1824
02d5137b 1825 if (reginfo->intuit && e < s) {
73104a1b
KW
1826 e = s; /* Due to minlen logic of intuit() */
1827 }
0658cdde 1828
73104a1b
KW
1829 /* XXX Note that we could recalculate e to stop the loop earlier,
1830 * as the worst case expansion above will rarely be met, and as we
1831 * go along we would usually find that e moves further to the left.
1832 * This would happen only after we reached the point in the loop
1833 * where if there were no expansion we should fail. Unclear if
1834 * worth the expense */
1835
1836 while (s <= e) {
1837 char *my_strend= (char *)strend;
1838 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1839 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1840 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1841 {
1842 goto got_it;
1843 }
1844 s += (utf8_target) ? UTF8SKIP(s) : 1;
1845 }
1846 break;
1847 }
1848 case BOUNDL:
0eb30aeb 1849 FBC_BOUND(isWORDCHAR_LC,
f4cd282c 1850 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1851 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1852 break;
1853 case NBOUNDL:
0eb30aeb 1854 FBC_NBOUND(isWORDCHAR_LC,
f4cd282c 1855 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1856 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1857 break;
1858 case BOUND:
1859 FBC_BOUND(isWORDCHAR,
0eb30aeb 1860 isWORDCHAR_uni(tmp),
03940dc2 1861 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1862 break;
1863 case BOUNDA:
1864 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1865 isWORDCHAR_A(tmp),
1866 isWORDCHAR_A((U8*)s));
1867 break;
1868 case NBOUND:
1869 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1870 isWORDCHAR_uni(tmp),
03940dc2 1871 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1872 break;
1873 case NBOUNDA:
1874 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1875 isWORDCHAR_A(tmp),
1876 isWORDCHAR_A((U8*)s));
1877 break;
1878 case BOUNDU:
1879 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1880 isWORDCHAR_uni(tmp),
03940dc2 1881 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1882 break;
1883 case NBOUNDU:
1884 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1885 isWORDCHAR_uni(tmp),
03940dc2 1886 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1887 break;
73104a1b
KW
1888 case LNBREAK:
1889 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1890 is_LNBREAK_latin1_safe(s, strend)
1891 );
1892 break;
3018b823
KW
1893
1894 /* The argument to all the POSIX node types is the class number to pass to
1895 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1896
1897 case NPOSIXL:
1898 to_complement = 1;
1899 /* FALLTHROUGH */
1900
1901 case POSIXL:
3018b823
KW
1902 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1903 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1904 break;
3018b823
KW
1905
1906 case NPOSIXD:
1907 to_complement = 1;
1908 /* FALLTHROUGH */
1909
1910 case POSIXD:
1911 if (utf8_target) {
1912 goto posix_utf8;
1913 }
1914 goto posixa;
1915
1916 case NPOSIXA:
1917 if (utf8_target) {
1918 /* The complement of something that matches only ASCII matches all
837226c8
KW
1919 * non-ASCII, plus everything in ASCII that isn't in the class. */
1920 REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
3018b823
KW
1921 || ! _generic_isCC_A(*s, FLAGS(c)));
1922 break;
1923 }
1924
1925 to_complement = 1;
1926 /* FALLTHROUGH */
1927
73104a1b 1928 case POSIXA:
3018b823 1929 posixa:
73104a1b 1930 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1931 * byte invariant character. */
1932 REXEC_FBC_CLASS_SCAN(
1933 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1934 break;
3018b823
KW
1935
1936 case NPOSIXU:
1937 to_complement = 1;
1938 /* FALLTHROUGH */
1939
1940 case POSIXU:
1941 if (! utf8_target) {
1942 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1943 FLAGS(c))));
1944 }
1945 else {
1946
1947 posix_utf8:
1948 classnum = (_char_class_number) FLAGS(c);
1949 if (classnum < _FIRST_NON_SWASH_CC) {
1950 while (s < strend) {
1951
1952 /* We avoid loading in the swash as long as possible, but
1953 * should we have to, we jump to a separate loop. This
1954 * extra 'if' statement is what keeps this code from being
1955 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1956 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1957 goto found_above_latin1;
1958 }
1959 if ((UTF8_IS_INVARIANT(*s)
1960 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1961 classnum)))
1962 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1963 && to_complement ^ cBOOL(
94bb8c36
KW
1964 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1965 *(s + 1)),
3018b823
KW
1966 classnum))))
1967 {
02d5137b 1968 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
1969 goto got_it;
1970 else {
1971 tmp = doevery;
1972 }
1973 }
1974 else {
1975 tmp = 1;
1976 }
1977 s += UTF8SKIP(s);
1978 }
1979 }
1980 else switch (classnum) { /* These classes are implemented as
1981 macros */
1982 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1983 revert the change of \v matching this */
924ba076 1984 /* FALLTHROUGH */
3018b823
KW
1985
1986 case _CC_ENUM_PSXSPC:
1987 REXEC_FBC_UTF8_CLASS_SCAN(
1988 to_complement ^ cBOOL(isSPACE_utf8(s)));
1989 break;
1990
1991 case _CC_ENUM_BLANK:
1992 REXEC_FBC_UTF8_CLASS_SCAN(
1993 to_complement ^ cBOOL(isBLANK_utf8(s)));
1994 break;
1995
1996 case _CC_ENUM_XDIGIT:
1997 REXEC_FBC_UTF8_CLASS_SCAN(
1998 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1999 break;
2000
2001 case _CC_ENUM_VERTSPACE:
2002 REXEC_FBC_UTF8_CLASS_SCAN(
2003 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2004 break;
2005
2006 case _CC_ENUM_CNTRL:
2007 REXEC_FBC_UTF8_CLASS_SCAN(
2008 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2009 break;
2010
2011 default:
2012 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2013 assert(0); /* NOTREACHED */
2014 }
2015 }
2016 break;
2017
2018 found_above_latin1: /* Here we have to load a swash to get the result
2019 for the current code point */
2020 if (! PL_utf8_swash_ptrs[classnum]) {
2021 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2022 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2023 _core_swash_init("utf8",
2024 "",
2025 &PL_sv_undef, 1, 0,
2026 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2027 }
2028
2029 /* This is a copy of the loop above for swash classes, though using the
2030 * FBC macro instead of being expanded out. Since we've loaded the
2031 * swash, we don't have to check for that each time through the loop */
2032 REXEC_FBC_UTF8_CLASS_SCAN(
2033 to_complement ^ cBOOL(_generic_utf8(
2034 classnum,
2035 s,
2036 swash_fetch(PL_utf8_swash_ptrs[classnum],
2037 (U8 *) s, TRUE))));
73104a1b
KW
2038 break;
2039
2040 case AHOCORASICKC:
2041 case AHOCORASICK:
2042 {
2043 DECL_TRIE_TYPE(c);
2044 /* what trie are we using right now */
2045 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2046 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2047 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2048
2049 const char *last_start = strend - trie->minlen;
6148ee25 2050#ifdef DEBUGGING
73104a1b 2051 const char *real_start = s;
6148ee25 2052#endif
73104a1b
KW
2053 STRLEN maxlen = trie->maxlen;
2054 SV *sv_points;
2055 U8 **points; /* map of where we were in the input string
2056 when reading a given char. For ASCII this
2057 is unnecessary overhead as the relationship
2058 is always 1:1, but for Unicode, especially
2059 case folded Unicode this is not true. */
2060 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2061 U8 *bitmap=NULL;
2062
2063
2064 GET_RE_DEBUG_FLAGS_DECL;
2065
2066 /* We can't just allocate points here. We need to wrap it in
2067 * an SV so it gets freed properly if there is a croak while
2068 * running the match */
2069 ENTER;
2070 SAVETMPS;
2071 sv_points=newSV(maxlen * sizeof(U8 *));
2072 SvCUR_set(sv_points,
2073 maxlen * sizeof(U8 *));
2074 SvPOK_on(sv_points);
2075 sv_2mortal(sv_points);
2076 points=(U8**)SvPV_nolen(sv_points );
2077 if ( trie_type != trie_utf8_fold
2078 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2079 {
2080 if (trie->bitmap)
2081 bitmap=(U8*)trie->bitmap;
2082 else
2083 bitmap=(U8*)ANYOF_BITMAP(c);
2084 }
2085 /* this is the Aho-Corasick algorithm modified a touch
2086 to include special handling for long "unknown char" sequences.
2087 The basic idea being that we use AC as long as we are dealing
2088 with a possible matching char, when we encounter an unknown char
2089 (and we have not encountered an accepting state) we scan forward
2090 until we find a legal starting char.
2091 AC matching is basically that of trie matching, except that when
2092 we encounter a failing transition, we fall back to the current
2093 states "fail state", and try the current char again, a process
2094 we repeat until we reach the root state, state 1, or a legal
2095 transition. If we fail on the root state then we can either
2096 terminate if we have reached an accepting state previously, or
2097 restart the entire process from the beginning if we have not.
2098
2099 */
2100 while (s <= last_start) {
2101 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2102 U8 *uc = (U8*)s;
2103 U16 charid = 0;
2104 U32 base = 1;
2105 U32 state = 1;
2106 UV uvc = 0;
2107 STRLEN len = 0;
2108 STRLEN foldlen = 0;
2109 U8 *uscan = (U8*)NULL;
2110 U8 *leftmost = NULL;
2111#ifdef DEBUGGING
2112 U32 accepted_word= 0;
786e8c11 2113#endif
73104a1b
KW
2114 U32 pointpos = 0;
2115
2116 while ( state && uc <= (U8*)strend ) {
2117 int failed=0;
2118 U32 word = aho->states[ state ].wordnum;
2119
2120 if( state==1 ) {
2121 if ( bitmap ) {
2122 DEBUG_TRIE_EXECUTE_r(
2123 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2124 dump_exec_pos( (char *)uc, c, strend, real_start,
2125 (char *)uc, utf8_target );
2126 PerlIO_printf( Perl_debug_log,
2127 " Scanning for legal start char...\n");
2128 }
2129 );
2130 if (utf8_target) {
2131 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2132 uc += UTF8SKIP(uc);
2133 }
2134 } else {
2135 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2136 uc++;
2137 }
786e8c11 2138 }
73104a1b 2139 s= (char *)uc;
07be1b83 2140 }
73104a1b
KW
2141 if (uc >(U8*)last_start) break;
2142 }
2143
2144 if ( word ) {
2145 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2146 if (!leftmost || lpos < leftmost) {
2147 DEBUG_r(accepted_word=word);
2148 leftmost= lpos;
7016d6eb 2149 }
73104a1b 2150 if (base==0) break;
7016d6eb 2151
73104a1b
KW
2152 }
2153 points[pointpos++ % maxlen]= uc;
2154 if (foldlen || uc < (U8*)strend) {
2155 REXEC_TRIE_READ_CHAR(trie_type, trie,
2156 widecharmap, uc,
2157 uscan, len, uvc, charid, foldlen,
2158 foldbuf, uniflags);
2159 DEBUG_TRIE_EXECUTE_r({
2160 dump_exec_pos( (char *)uc, c, strend,
2161 real_start, s, utf8_target);
2162 PerlIO_printf(Perl_debug_log,
2163 " Charid:%3u CP:%4"UVxf" ",
2164 charid, uvc);
2165 });
2166 }
2167 else {
2168 len = 0;
2169 charid = 0;
2170 }
07be1b83 2171
73104a1b
KW
2172
2173 do {
6148ee25 2174#ifdef DEBUGGING
73104a1b 2175 word = aho->states[ state ].wordnum;
6148ee25 2176#endif
73104a1b
KW
2177 base = aho->states[ state ].trans.base;
2178
2179 DEBUG_TRIE_EXECUTE_r({
2180 if (failed)
2181 dump_exec_pos( (char *)uc, c, strend, real_start,
2182 s, utf8_target );
2183 PerlIO_printf( Perl_debug_log,
2184 "%sState: %4"UVxf", word=%"UVxf,
2185 failed ? " Fail transition to " : "",
2186 (UV)state, (UV)word);
2187 });
2188 if ( base ) {
2189 U32 tmp;
2190 I32 offset;
2191 if (charid &&
2192 ( ((offset = base + charid
2193 - 1 - trie->uniquecharcount)) >= 0)
2194 && ((U32)offset < trie->lasttrans)
2195 && trie->trans[offset].check == state
2196 && (tmp=trie->trans[offset].next))
2197 {
2198 DEBUG_TRIE_EXECUTE_r(
2199 PerlIO_printf( Perl_debug_log," - legal\n"));
2200 state = tmp;
2201 break;
07be1b83
YO
2202 }
2203 else {
786e8c11 2204 DEBUG_TRIE_EXECUTE_r(
73104a1b 2205 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2206 failed = 1;
73104a1b 2207 state = aho->fail[state];
07be1b83 2208 }
07be1b83 2209 }
73104a1b
KW
2210 else {
2211 /* we must be accepting here */
2212 DEBUG_TRIE_EXECUTE_r(
2213 PerlIO_printf( Perl_debug_log," - accepting\n"));
2214 failed = 1;
2215 break;
786e8c11 2216 }
73104a1b
KW
2217 } while(state);
2218 uc += len;
2219 if (failed) {
2220 if (leftmost)
2221 break;
2222 if (!state) state = 1;
07be1b83 2223 }
73104a1b
KW
2224 }
2225 if ( aho->states[ state ].wordnum ) {
2226 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2227 if (!leftmost || lpos < leftmost) {
2228 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2229 leftmost = lpos;
07be1b83
YO
2230 }
2231 }
73104a1b
KW
2232 if (leftmost) {
2233 s = (char*)leftmost;
2234 DEBUG_TRIE_EXECUTE_r({
2235 PerlIO_printf(
2236 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2237 (UV)accepted_word, (IV)(s - real_start)
2238 );
2239 });
02d5137b 2240 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2241 FREETMPS;
2242 LEAVE;
2243 goto got_it;
2244 }
2245 s = HOPc(s,1);
2246 DEBUG_TRIE_EXECUTE_r({
2247 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2248 });
2249 } else {
2250 DEBUG_TRIE_EXECUTE_r(
2251 PerlIO_printf( Perl_debug_log,"No match.\n"));
2252 break;
2253 }
2254 }
2255 FREETMPS;
2256 LEAVE;
2257 }
2258 break;
2259 default:
2260 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2261 break;
2262 }
2263 return 0;
2264 got_it:
2265 return s;
6eb5f6b9
JH
2266}
2267
60165aa4
DM
2268/* set RX_SAVED_COPY, RX_SUBBEG etc.
2269 * flags have same meanings as with regexec_flags() */
2270
749f4950
DM
2271static void
2272S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2273 char *strbeg,
2274 char *strend,
2275 SV *sv,
2276 U32 flags,
2277 bool utf8_target)
2278{
2279 struct regexp *const prog = ReANY(rx);
2280
60165aa4
DM
2281 if (flags & REXEC_COPY_STR) {
2282#ifdef PERL_ANY_COW
2283 if (SvCANCOW(sv)) {
2284 if (DEBUG_C_TEST) {
2285 PerlIO_printf(Perl_debug_log,
2286 "Copy on write: regexp capture, type %d\n",
2287 (int) SvTYPE(sv));
2288 }
5411a0e5
DM
2289 /* Create a new COW SV to share the match string and store
2290 * in saved_copy, unless the current COW SV in saved_copy
2291 * is valid and suitable for our purpose */
2292 if (( prog->saved_copy
2293 && SvIsCOW(prog->saved_copy)
2294 && SvPOKp(prog->saved_copy)
2295 && SvIsCOW(sv)
2296 && SvPOKp(sv)
2297 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2298 {
5411a0e5
DM
2299 /* just reuse saved_copy SV */
2300 if (RXp_MATCH_COPIED(prog)) {
2301 Safefree(prog->subbeg);
2302 RXp_MATCH_COPIED_off(prog);
2303 }
2304 }
2305 else {
2306 /* create new COW SV to share string */
a76b0e90
DM
2307 RX_MATCH_COPY_FREE(rx);
2308 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2309 }
5411a0e5
DM
2310 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2311 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2312 prog->sublen = strend - strbeg;
2313 prog->suboffset = 0;
2314 prog->subcoffset = 0;
2315 } else
2316#endif
2317 {
99a90e59
FC
2318 SSize_t min = 0;
2319 SSize_t max = strend - strbeg;
ea3daa5d 2320 SSize_t sublen;
60165aa4
DM
2321
2322 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2323 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2324 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2325 ) { /* don't copy $' part of string */
2326 U32 n = 0;
2327 max = -1;
2328 /* calculate the right-most part of the string covered
2329 * by a capture. Due to look-ahead, this may be to
2330 * the right of $&, so we have to scan all captures */
2331 while (n <= prog->lastparen) {
2332 if (prog->offs[n].end > max)
2333 max = prog->offs[n].end;
2334 n++;
2335 }
2336 if (max == -1)
2337 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2338 ? prog->offs[0].start
2339 : 0;
2340 assert(max >= 0 && max <= strend - strbeg);
2341 }
2342
2343 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2344 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2345 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2346 ) { /* don't copy $` part of string */
2347 U32 n = 0;
2348 min = max;
2349 /* calculate the left-most part of the string covered
2350 * by a capture. Due to look-behind, this may be to
2351 * the left of $&, so we have to scan all captures */
2352 while (min && n <= prog->lastparen) {
2353 if ( prog->offs[n].start != -1
2354 && prog->offs[n].start < min)
2355 {
2356 min = prog->offs[n].start;
2357 }
2358 n++;
2359 }
2360 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2361 && min > prog->offs[0].end
2362 )
2363 min = prog->offs[0].end;
2364
2365 }
2366
2367 assert(min >= 0 && min <= max && min <= strend - strbeg);
2368 sublen = max - min;
2369
2370 if (RX_MATCH_COPIED(rx)) {
2371 if (sublen > prog->sublen)
2372 prog->subbeg =
2373 (char*)saferealloc(prog->subbeg, sublen+1);
2374 }
2375 else
2376 prog->subbeg = (char*)safemalloc(sublen+1);
2377 Copy(strbeg + min, prog->subbeg, sublen, char);
2378 prog->subbeg[sublen] = '\0';
2379 prog->suboffset = min;
2380 prog->sublen = sublen;
2381 RX_MATCH_COPIED_on(rx);
2382 }
2383 prog->subcoffset = prog->suboffset;
2384 if (prog->suboffset && utf8_target) {
2385 /* Convert byte offset to chars.
2386 * XXX ideally should only compute this if @-/@+
2387 * has been seen, a la PL_sawampersand ??? */
2388
2389 /* If there's a direct correspondence between the
2390 * string which we're matching and the original SV,
2391 * then we can use the utf8 len cache associated with
2392 * the SV. In particular, it means that under //g,
2393 * sv_pos_b2u() will use the previously cached
2394 * position to speed up working out the new length of
2395 * subcoffset, rather than counting from the start of
2396 * the string each time. This stops
2397 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2398 * from going quadratic */
2399 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2400 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2401 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2402 else
2403 prog->subcoffset = utf8_length((U8*)strbeg,
2404 (U8*)(strbeg+prog->suboffset));
2405 }
2406 }
2407 else {
2408 RX_MATCH_COPY_FREE(rx);
2409 prog->subbeg = strbeg;
2410 prog->suboffset = 0;
2411 prog->subcoffset = 0;
2412 prog->sublen = strend - strbeg;
2413 }
2414}
2415
2416
2417
fae667d5 2418
6eb5f6b9
JH
2419/*
2420 - regexec_flags - match a regexp against a string
2421 */
2422I32
5aaab254 2423Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2424 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2425/* stringarg: the point in the string at which to begin matching */
2426/* strend: pointer to null at end of string */
2427/* strbeg: real beginning of string */
2428/* minend: end of match must be >= minend bytes after stringarg. */
2429/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2430 * itself is accessed via the pointers above */
2431/* data: May be used for some additional optimizations.
d058ec57 2432 Currently unused. */
a340edde 2433/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2434
6eb5f6b9 2435{
97aff369 2436 dVAR;
8d919b0a 2437 struct regexp *const prog = ReANY(rx);
5aaab254 2438 char *s;
eb578fdb 2439 regnode *c;
03c83e26 2440 char *startpos;
ea3daa5d
FC
2441 SSize_t minlen; /* must match at least this many chars */
2442 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2443 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2444 I32 multiline;
f8fc2ecf 2445 RXi_GET_DECL(prog,progi);
02d5137b
DM
2446 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2447 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2448 regexp_paren_pair *swap = NULL;
006f26b2 2449 I32 oldsave;
a3621e74
YO
2450 GET_RE_DEBUG_FLAGS_DECL;
2451
7918f24d 2452 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2453 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2454
2455 /* Be paranoid... */
03c83e26 2456 if (prog == NULL || stringarg == NULL) {
6eb5f6b9
JH
2457 Perl_croak(aTHX_ "NULL regexp parameter");
2458 return 0;
2459 }
2460
6c3fea77 2461 DEBUG_EXECUTE_r(
03c83e26 2462 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2463 "Matching");
2464 );
8adc0f72 2465
b342a604
DM
2466 startpos = stringarg;
2467
58430ea8 2468 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2469 MAGIC *mg;
2470
fef7148b
DM
2471 /* set reginfo->ganch, the position where \G can match */
2472
2473 reginfo->ganch =
2474 (flags & REXEC_IGNOREPOS)
2475 ? stringarg /* use start pos rather than pos() */
2476 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2477 /* Defined pos(): */
2478 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2479 : strbeg; /* pos() not defined; use start of string */
2480
2481 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2482 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2483
03c83e26
DM
2484 /* in the presence of \G, we may need to start looking earlier in
2485 * the string than the suggested start point of stringarg:
0b2c2a84 2486 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2487 * offset, such as
2488 * /..\G/: gofs = 2
2489 * /ab|c\G/: gofs = 1
2490 * or if the minimum offset isn't known, then we have to go back
2491 * to the start of the string, e.g. /w+\G/
2492 */
2bfbe302 2493
8e1490ee 2494 if (prog->intflags & PREGf_ANCH_GPOS) {
2bfbe302
DM
2495 startpos = reginfo->ganch - prog->gofs;
2496 if (startpos <
2497 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2498 {
2499 DEBUG_r(PerlIO_printf(Perl_debug_log,
2500 "fail: ganch-gofs before earliest possible start\n"));
2501 return 0;
2502 }
2503 }
2504 else if (prog->gofs) {
b342a604
DM
2505 if (startpos - prog->gofs < strbeg)
2506 startpos = strbeg;
2507 else
2508 startpos -= prog->gofs;
03c83e26 2509 }
58430ea8 2510 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2511 startpos = strbeg;
03c83e26
DM
2512 }
2513
2514 minlen = prog->minlen;
b342a604 2515 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2516 DEBUG_r(PerlIO_printf(Perl_debug_log,
2517 "Regex match can't succeed, so not even tried\n"));
2518 return 0;
2519 }
2520
63a3746a
DM
2521 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2522 * which will call destuctors to reset PL_regmatch_state, free higher
2523 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2524 * regmatch_info_aux_eval */
2525
2526 oldsave = PL_savestack_ix;
2527
dfa77d06
DM
2528 s = startpos;
2529
e322109a 2530 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2531 && !(flags & REXEC_CHECKED))
2532 {
dfa77d06 2533 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2534 flags, NULL);
dfa77d06 2535 if (!s)
7fadf4a7
DM
2536 return 0;
2537
e322109a 2538 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2539 /* we can match based purely on the result of INTUIT.
2540 * Set up captures etc just for $& and $-[0]
2541 * (an intuit-only match wont have $1,$2,..) */
2542 assert(!prog->nparens);
d5e7783a
DM
2543
2544 /* s/// doesn't like it if $& is earlier than where we asked it to
2545 * start searching (which can happen on something like /.\G/) */
2546 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2547 && (s < stringarg))
2548 {
2549 /* this should only be possible under \G */
58430ea8 2550 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2551 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2552 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2553 goto phooey;
2554 }
2555
7fadf4a7
DM
2556 /* match via INTUIT shouldn't have any captures.
2557 * Let @-, @+, $^N know */
2558 prog->lastparen = prog->lastcloseparen = 0;
2559 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2560 prog->offs[0].start = s - strbeg;
2561 prog->offs[0].end = utf8_target
2562 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2563 : s - strbeg + prog->minlenret;
7fadf4a7 2564 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2565 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2566 strbeg, strend,
2567 sv, flags, utf8_target);
2568
7fadf4a7
DM
2569 return 1;
2570 }
2571 }
2572
6c3fea77 2573 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2574
dfa77d06 2575 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2576 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2577 "String too short [regexec_flags]...\n"));
2578 goto phooey;
1aa99e6b 2579 }
1de06328 2580
6eb5f6b9 2581 /* Check validity of program. */
f8fc2ecf 2582 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2583 Perl_croak(aTHX_ "corrupted regexp program");
2584 }
2585
1738e041
DM
2586 RX_MATCH_TAINTED_off(rx);
2587
6c3fea77
DM
2588 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2589 reginfo->intuit = 0;
2590 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2591 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2592 reginfo->warned = FALSE;
9d9163fb 2593 reginfo->strbeg = strbeg;
02d5137b 2594 reginfo->sv = sv;
1cb48e53 2595 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2596 reginfo->strend = strend;
6eb5f6b9 2597 /* see how far we have to get to not match where we matched before */
fe3974be 2598 reginfo->till = stringarg + minend;
6eb5f6b9 2599
60779a30 2600 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
2601 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2602 S_cleanup_regmatch_info_aux has executed (registered by
2603 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2604 magic belonging to this SV.
2605 Not newSVsv, either, as it does not COW.
2606 */
60779a30 2607 assert(!IS_PADGV(sv));
82c23608 2608 reginfo->sv = newSV(0);
4cba5ac0 2609 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2610 SAVEFREESV(reginfo->sv);
2611 }
2612
331b2dcc
DM
2613 /* reserve next 2 or 3 slots in PL_regmatch_state:
2614 * slot N+0: may currently be in use: skip it
2615 * slot N+1: use for regmatch_info_aux struct
2616 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2617 * slot N+3: ready for use by regmatch()
2618 */
bf2039a9 2619
331b2dcc
DM
2620 {
2621 regmatch_state *old_regmatch_state;
2622 regmatch_slab *old_regmatch_slab;
2623 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2624
2625 /* on first ever match, allocate first slab */
2626 if (!PL_regmatch_slab) {
2627 Newx(PL_regmatch_slab, 1, regmatch_slab);
2628 PL_regmatch_slab->prev = NULL;
2629 PL_regmatch_slab->next = NULL;
2630 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2631 }
bf2039a9 2632
331b2dcc
DM
2633 old_regmatch_state = PL_regmatch_state;
2634 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2635
331b2dcc
DM
2636 for (i=0; i <= max; i++) {
2637 if (i == 1)
2638 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2639 else if (i ==2)
2640 reginfo->info_aux_eval =
2641 reginfo->info_aux->info_aux_eval =
2642 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 2643
331b2dcc
DM
2644 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2645 PL_regmatch_state = S_push_slab(aTHX);
2646 }
bf2039a9 2647
331b2dcc
DM
2648 /* note initial PL_regmatch_state position; at end of match we'll
2649 * pop back to there and free any higher slabs */
bf2039a9 2650
331b2dcc
DM
2651 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2652 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 2653 reginfo->info_aux->poscache = NULL;
bf2039a9 2654
331b2dcc 2655 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 2656
331b2dcc
DM
2657 if ((prog->extflags & RXf_EVAL_SEEN))
2658 S_setup_eval_state(aTHX_ reginfo);
2659 else
2660 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 2661 }
d3aa529c 2662
6eb5f6b9 2663 /* If there is a "must appear" string, look for it. */
6eb5f6b9 2664
288b8c02 2665 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2666 /* We have to be careful. If the previous successful match
2667 was from this regex we don't want a subsequent partially
2668 successful match to clobber the old results.
2669 So when we detect this possibility we add a swap buffer
d8da0584
KW
2670 to the re, and switch the buffer each match. If we fail,
2671 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2672 */
2673 swap = prog->offs;
2674 /* do we need a save destructor here for eval dies? */
2675 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2676 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2677 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2678 PTR2UV(prog),
2679 PTR2UV(swap),
2680 PTR2UV(prog->offs)
2681 ));
c74340f9 2682 }
6eb5f6b9
JH
2683
2684 /* Simplest case: anchored match need be tried only once. */
2685 /* [unless only anchor is BOL and multiline is set] */
8e1490ee 2686 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3542935d 2687 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9 2688 goto got_it;
8e1490ee 2689 else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
6eb5f6b9
JH
2690 {
2691 char *end;
2692
2693 if (minlen)
2694 dontbother = minlen - 1;
1aa99e6b 2695 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2696 /* for multiline we only have to try after newlines */
33b8afdf 2697 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2698 /* because of the goto we can not easily reuse the macros for bifurcating the
2699 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2700 if (utf8_target) {
2701 if (s == startpos)
2702 goto after_try_utf8;
2703 while (1) {
02d5137b 2704 if (regtry(reginfo, &s)) {
92f3d482
YO
2705 goto got_it;
2706 }
2707 after_try_utf8:
2708 if (s > end) {
2709 goto phooey;
2710 }
2711 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2712 s = re_intuit_start(rx, sv, strbeg,
2713 s + UTF8SKIP(s), strend, flags, NULL);
92f3d482
YO
2714 if (!s) {
2715 goto phooey;
2716 }
2717 }
2718 else {
2719 s += UTF8SKIP(s);
2720 }
2721 }
2722 } /* end search for check string in unicode */
2723 else {
2724 if (s == startpos) {
2725 goto after_try_latin;
2726 }
2727 while (1) {
02d5137b 2728 if (regtry(reginfo, &s)) {
92f3d482
YO
2729 goto got_it;
2730 }
2731 after_try_latin:
2732 if (s > end) {
2733 goto phooey;
2734 }
2735 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2736 s = re_intuit_start(rx, sv, strbeg,
2737 s + 1, strend, flags, NULL);
92f3d482
YO
2738 if (!s) {
2739 goto phooey;
2740 }
2741 }
2742 else {
2743 s++;
2744 }
2745 }
2746 } /* end search for check string in latin*/
2747 } /* end search for check string */
2748 else { /* search for newline */
2749 if (s > startpos) {
2750 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2751 s--;
92f3d482 2752 }
21eede78
YO
2753 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2754 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2755 if (*s++ == '\n') { /* don't need PL_utf8skip here */
02d5137b 2756 if (regtry(reginfo, &s))
6eb5f6b9
JH
2757 goto got_it;
2758 }
92f3d482
YO
2759 }
2760 } /* end search for newline */
2761 } /* end anchored/multiline check string search */
6eb5f6b9 2762 goto phooey;
a8430a8b 2763 } else if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 2764 {
a8430a8b
YO
2765 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2766 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
2767 /* For anchored \G, the only position it can match from is
2768 * (ganch-gofs); we already set startpos to this above; if intuit
2769 * moved us on from there, we can't possibly succeed */
2770 assert(startpos == reginfo->ganch - prog->gofs);
2771 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
2772 goto got_it;
2773 goto phooey;
2774 }
2775
2776 /* Messy cases: unanchored match. */
bbe252da 2777 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2778 /* we have /x+whatever/ */
984e6dd1 2779 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2780 char ch;
bf93d4cc
GS
2781#ifdef DEBUGGING
2782 int did_match = 0;
2783#endif
f2ed9b32 2784 if (utf8_target) {
7e0d5ad7
KW
2785 if (! prog->anchored_utf8) {
2786 to_utf8_substr(prog);
2787 }
2788 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2789 REXEC_FBC_SCAN(
6eb5f6b9 2790 if (*s == ch) {
a3621e74 2791 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2792 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2793 s += UTF8SKIP(s);
2794 while (s < strend && *s == ch)
2795 s += UTF8SKIP(s);
2796 }
4cadc6a9 2797 );
7e0d5ad7 2798
6eb5f6b9
JH
2799 }
2800 else {
7e0d5ad7
KW
2801 if (! prog->anchored_substr) {
2802 if (! to_byte_substr(prog)) {
6b54ddc5 2803 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2804 }
2805 }
2806 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2807 REXEC_FBC_SCAN(
6eb5f6b9 2808 if (*s == ch) {
a3621e74 2809 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2810 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2811 s++;
2812 while (s < strend && *s == ch)
2813 s++;
2814 }
4cadc6a9 2815 );
6eb5f6b9 2816 }
a3621e74 2817 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2818 PerlIO_printf(Perl_debug_log,
b7953727
JH
2819 "Did not find anchored character...\n")
2820 );
6eb5f6b9 2821 }
a0714e2c
SS
2822 else if (prog->anchored_substr != NULL
2823 || prog->anchored_utf8 != NULL
2824 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2825 && prog->float_max_offset < strend - s)) {
2826 SV *must;
ea3daa5d
FC
2827 SSize_t back_max;
2828 SSize_t back_min;
33b8afdf 2829 char *last;
6eb5f6b9 2830 char *last1; /* Last position checked before */
bf93d4cc
GS
2831#ifdef DEBUGGING
2832 int did_match = 0;
2833#endif
33b8afdf 2834 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2835 if (utf8_target) {
2836 if (! prog->anchored_utf8) {
2837 to_utf8_substr(prog);
2838 }
2839 must = prog->anchored_utf8;
2840 }
2841 else {
2842 if (! prog->anchored_substr) {
2843 if (! to_byte_substr(prog)) {
6b54ddc5 2844 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2845 }
2846 }
2847 must = prog->anchored_substr;
2848 }
33b8afdf
JH
2849 back_max = back_min = prog->anchored_offset;
2850 } else {
7e0d5ad7
KW
2851 if (utf8_target) {
2852 if (! prog->float_utf8) {
2853 to_utf8_substr(prog);
2854 }
2855 must = prog->float_utf8;
2856 }
2857 else {
2858 if (! prog->float_substr) {
2859 if (! to_byte_substr(prog)) {
6b54ddc5 2860 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2861 }
2862 }
2863 must = prog->float_substr;
2864 }
33b8afdf
JH
2865 back_max = prog->float_max_offset;
2866 back_min = prog->float_min_offset;
2867 }
1de06328 2868
1de06328
YO
2869 if (back_min<0) {
2870 last = strend;
2871 } else {
2872 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 2873 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
2874 - (SvTAIL(must) != 0) + back_min), strbeg);
2875 }
9d9163fb 2876 if (s > reginfo->strbeg)
6eb5f6b9
JH
2877 last1 = HOPc(s, -1);
2878 else
2879 last1 = s - 1; /* bogus */
2880
a0288114 2881 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 2882 check_substr==must. */
bf05793b 2883 dontbother = 0;
6eb5f6b9
JH
2884 strend = HOPc(strend, -dontbother);
2885 while ( (s <= last) &&
e50d57d4 2886 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 2887 (unsigned char*)strend, must,
c33e64f0 2888 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2889 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2890 if (HOPc(s, -back_max) > last1) {
2891 last1 = HOPc(s, -back_min);
2892 s = HOPc(s, -back_max);
2893 }
2894 else {
9d9163fb
DM
2895 char * const t = (last1 >= reginfo->strbeg)
2896 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2897
2898 last1 = HOPc(s, -back_min);
52657f30 2899 s = t;
6eb5f6b9 2900 }
f2ed9b32 2901 if (utf8_target) {
6eb5f6b9 2902 while (s <= last1) {
02d5137b 2903 if (regtry(reginfo, &s))
6eb5f6b9 2904 goto got_it;
7016d6eb
DM
2905 if (s >= last1) {
2906 s++; /* to break out of outer loop */
2907 break;
2908 }
2909 s += UTF8SKIP(s);
6eb5f6b9
JH
2910 }
2911 }
2912 else {
2913 while (s <= last1) {
02d5137b 2914 if (regtry(reginfo, &s))
6eb5f6b9
JH
2915 goto got_it;
2916 s++;
2917 }
2918 }
2919 }
ab3bbdeb 2920 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2921 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2922 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2923 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2924 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2925 ? "anchored" : "floating"),
ab3bbdeb
YO
2926 quoted, RE_SV_TAIL(must));
2927 });
6eb5f6b9
JH
2928 goto phooey;
2929 }
f8fc2ecf 2930 else if ( (c = progi->regstclass) ) {
f14c76ed 2931 if (minlen) {
f8fc2ecf 2932 const OPCODE op = OP(progi->regstclass);
66e933ab 2933 /* don't bother with what can't match */
786e8c11 2934 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2935 strend = HOPc(strend, -(minlen - 1));
2936 }
a3621e74 2937 DEBUG_EXECUTE_r({
be8e71aa 2938 SV * const prop = sv_newmortal();
2395827c 2939 regprop(prog, prop, c, reginfo);
0df25f3d 2940 {
f2ed9b32 2941 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2942 s,strend-s,60);
0df25f3d 2943 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2944 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2945 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2946 quoted, (int)(strend - s));
0df25f3d 2947 }
ffc61ed2 2948 });
f9176b44 2949 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 2950 goto got_it;
07be1b83 2951 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2952 }
2953 else {
2954 dontbother = 0;
a0714e2c 2955 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2956 /* Trim the end. */
6af40bd7 2957 char *last= NULL;
33b8afdf 2958 SV* float_real;
c33e64f0
FC
2959 STRLEN len;
2960 const char *little;
33b8afdf 2961
7e0d5ad7
KW
2962 if (utf8_target) {
2963 if (! prog->float_utf8) {
2964 to_utf8_substr(prog);
2965 }
2966 float_real = prog->float_utf8;
2967 }
2968 else {
2969 if (! prog->float_substr) {
2970 if (! to_byte_substr(prog)) {
6b54ddc5 2971 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2972 }
2973 }
2974 float_real = prog->float_substr;
2975 }
d6a28714 2976
c33e64f0
FC
2977 little = SvPV_const(float_real, len);
2978 if (SvTAIL(float_real)) {
7f18ad16
KW
2979 /* This means that float_real contains an artificial \n on
2980 * the end due to the presence of something like this:
2981 * /foo$/ where we can match both "foo" and "foo\n" at the
2982 * end of the string. So we have to compare the end of the
2983 * string first against the float_real without the \n and
2984 * then against the full float_real with the string. We
2985 * have to watch out for cases where the string might be
2986 * smaller than the float_real or the float_real without
2987 * the \n. */
1a13b075
YO
2988 char *checkpos= strend - len;
2989 DEBUG_OPTIMISE_r(
2990 PerlIO_printf(Perl_debug_log,
2991 "%sChecking for float_real.%s\n",
2992 PL_colors[4], PL_colors[5]));
2993 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2994 /* can't match, even if we remove the trailing \n
2995 * string is too short to match */
1a13b075
YO
2996 DEBUG_EXECUTE_r(
2997 PerlIO_printf(Perl_debug_log,
2998 "%sString shorter than required trailing substring, cannot match.%s\n",
2999 PL_colors[4], PL_colors[5]));
3000 goto phooey;
3001 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
3002 /* can match, the end of the string matches without the
3003 * "\n" */
1a13b075
YO
3004 last = checkpos + 1;
3005 } else if (checkpos < strbeg) {
7f18ad16
KW
3006 /* cant match, string is too short when the "\n" is
3007 * included */
1a13b075
YO
3008 DEBUG_EXECUTE_r(
3009 PerlIO_printf(Perl_debug_log,
3010 "%sString does not contain required trailing substring, cannot match.%s\n",
3011 PL_colors[4], PL_colors[5]));
3012 goto phooey;
3013 } else if (!multiline) {
7f18ad16
KW
3014 /* non multiline match, so compare with the "\n" at the
3015 * end of the string */
1a13b075
YO
3016 if (memEQ(checkpos, little, len)) {
3017 last= checkpos;
3018 } else {
3019 DEBUG_EXECUTE_r(
3020 PerlIO_printf(Perl_debug_log,
3021 "%sString does not contain required trailing substring, cannot match.%s\n",
3022 PL_colors[4], PL_colors[5]));
3023 goto phooey;
3024 }
3025 } else {
7f18ad16
KW
3026 /* multiline match, so we have to search for a place
3027 * where the full string is located */
d6a28714 3028 goto find_last;
1a13b075 3029 }
c33e64f0 3030 } else {
d6a28714 3031 find_last:
9041c2e3 3032 if (len)
d6a28714 3033 last = rninstr(s, strend, little, little + len);
b8c5462f 3034 else
a0288114 3035 last = strend; /* matching "$" */
b8c5462f 3036 }
6af40bd7 3037 if (!last) {
7f18ad16
KW
3038 /* at one point this block contained a comment which was
3039 * probably incorrect, which said that this was a "should not
3040 * happen" case. Even if it was true when it was written I am
3041 * pretty sure it is not anymore, so I have removed the comment
3042 * and replaced it with this one. Yves */
6bda09f9
YO
3043 DEBUG_EXECUTE_r(
3044 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
3045 "String does not contain required substring, cannot match.\n"
3046 ));
3047 goto phooey;
bf93d4cc 3048 }
d6a28714
JH
3049 dontbother = strend - last + prog->float_min_offset;
3050 }
3051 if (minlen && (dontbother < minlen))
3052 dontbother = minlen - 1;
3053 strend -= dontbother; /* this one's always in bytes! */
3054 /* We don't know much -- general case. */
f2ed9b32 3055 if (utf8_target) {
d6a28714 3056 for (;;) {
02d5137b 3057 if (regtry(reginfo, &s))
d6a28714
JH
3058 goto got_it;
3059 if (s >= strend)
3060 break;
b8c5462f 3061 s += UTF8SKIP(s);
d6a28714
JH
3062 };
3063 }
3064 else {
3065 do {
02d5137b 3066 if (regtry(reginfo, &s))
d6a28714
JH
3067 goto got_it;
3068 } while (s++ < strend);
3069 }
3070 }
3071
3072 /* Failure. */
3073 goto phooey;
3074
3075got_it:
d5e7783a
DM
3076 /* s/// doesn't like it if $& is earlier than where we asked it to
3077 * start searching (which can happen on something like /.\G/) */
3078 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3079 && (prog->offs[0].start < stringarg - strbeg))
3080 {
3081 /* this should only be possible under \G */
58430ea8 3082 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
3083 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3084 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3085 goto phooey;
3086 }
3087
495f47a5
DM
3088 DEBUG_BUFFERS_r(
3089 if (swap)
3090 PerlIO_printf(Perl_debug_log,
3091 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3092 PTR2UV(prog),
3093 PTR2UV(swap)
3094 );
3095 );
e9105d30 3096 Safefree(swap);
d6a28714 3097
bf2039a9
DM
3098 /* clean up; this will trigger destructors that will free all slabs
3099 * above the current one, and cleanup the regmatch_info_aux
3100 * and regmatch_info_aux_eval sructs */
8adc0f72 3101
006f26b2
DM
3102 LEAVE_SCOPE(oldsave);
3103
5daac39c
NC
3104 if (RXp_PAREN_NAMES(prog))
3105 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714 3106
0254aed9
DM
3107 RX_MATCH_UTF8_set(rx, utf8_target);
3108
d6a28714 3109 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3110 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3111 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3112 strbeg, reginfo->strend,
3113 sv, flags, utf8_target);
9041c2e3 3114
d6a28714
JH
3115 return 1;
3116
3117phooey:
a3621e74 3118 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 3119 PL_colors[4], PL_colors[5]));
8adc0f72 3120
bf2039a9
DM
3121 /* clean up; this will trigger destructors that will free all slabs
3122 * above the current one, and cleanup the regmatch_info_aux
3123 * and regmatch_info_aux_eval sructs */
8adc0f72 3124
006f26b2
DM
3125 LEAVE_SCOPE(oldsave);
3126
e9105d30 3127 if (swap) {
c74340f9 3128 /* we failed :-( roll it back */
495f47a5
DM
3129 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3130 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3131 PTR2UV(prog),
3132 PTR2UV(prog->offs),
3133 PTR2UV(swap)
3134 ));
e9105d30
GG
3135 Safefree(prog->offs);
3136 prog->offs = swap;
3137 }
d6a28714
JH
3138 return 0;
3139}
3140
6bda09f9 3141
b3d298be 3142/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3143 * Do inc before dec, in case old and new rex are the same */
baa60164 3144#define SET_reg_curpm(Re2) \
bf2039a9 3145 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3146 (void)ReREFCNT_inc(Re2); \
3147 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3148 PM_SETRE((PL_reg_curpm), (Re2)); \
3149 }
3150
3151
d6a28714
JH
3152/*
3153 - regtry - try match at specific point
3154 */
3155STATIC I32 /* 0 failure, 1 success */
f73aaa43 3156S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3157{
97aff369 3158 dVAR;
d6a28714 3159 CHECKPOINT lastcp;
288b8c02 3160 REGEXP *const rx = reginfo->prog;
8d919b0a 3161 regexp *const prog = ReANY(rx);
99a90e59 3162 SSize_t result;
f8fc2ecf 3163 RXi_GET_DECL(prog,progi);
a3621e74 3164 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3165
3166 PERL_ARGS_ASSERT_REGTRY;
3167
24b23f37 3168 reginfo->cutpoint=NULL;
d6a28714 3169
9d9163fb 3170 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3171 prog->lastparen = 0;
03994de8 3172 prog->lastcloseparen = 0;
d6a28714
JH
3173
3174 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3175 to do this again and again, prog->lastparen should take care of
3dd2943c 3176 this! --ilya*/
dafc8851
JH
3177
3178 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3179 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3180 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3181 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3182 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3183 * Meanwhile, this code *is* needed for the
daf18116
JH
3184 * above-mentioned test suite tests to succeed. The common theme
3185 * on those tests seems to be returning null fields from matches.
225593e1 3186 * --jhi updated by dapm */
dafc8851 3187#if 1
d6a28714 3188 if (prog->nparens) {
b93070ed 3189 regexp_paren_pair *pp = prog->offs;
eb578fdb 3190 I32 i;
b93070ed 3191 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3192 ++pp;
3193 pp->start = -1;
3194 pp->end = -1;
d6a28714
JH
3195 }
3196 }
dafc8851 3197#endif
02db2b7b 3198 REGCP_SET(lastcp);
f73aaa43
DM
3199 result = regmatch(reginfo, *startposp, progi->program + 1);
3200 if (result != -1) {
3201 prog->offs[0].end = result;
d6a28714
JH
3202 return 1;
3203 }
24b23f37 3204 if (reginfo->cutpoint)
f73aaa43 3205 *startposp= reginfo->cutpoint;
02db2b7b 3206 REGCP_UNWIND(lastcp);
d6a28714
JH
3207 return 0;
3208}
3209
02db2b7b 3210
8ba1375e
MJD
3211#define sayYES goto yes
3212#define sayNO goto no
262b90c4 3213#define sayNO_SILENT goto no_silent
8ba1375e 3214
f9f4320a
YO
3215/* we dont use STMT_START/END here because it leads to
3216 "unreachable code" warnings, which are bogus, but distracting. */
3217#define CACHEsayNO \
c476f425 3218 if (ST.cache_mask) \
2ac8ff4b 3219 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3220 sayNO
3298f257 3221
a3621e74 3222/* this is used to determine how far from the left messages like
265c4333
YO
3223 'failed...' are printed. It should be set such that messages
3224 are inline with the regop output that created them.
a3621e74 3225*/
265c4333 3226#define REPORT_CODE_OFF 32
a3621e74
YO
3227
3228
40a82448
DM
3229#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3230#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3231#define CHRTEST_NOT_A_CP_1 -999
3232#define CHRTEST_NOT_A_CP_2 -998
9e137952 3233
5d9a96ca
DM
3234/* grab a new slab and return the first slot in it */
3235
3236STATIC regmatch_state *
3237S_push_slab(pTHX)
3238{
a35a87e7 3239#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3240 dMY_CXT;
3241#endif
5d9a96ca
DM
3242 regmatch_slab *s = PL_regmatch_slab->next;
3243 if (!s) {
3244 Newx(s, 1, regmatch_slab);
3245 s->prev = PL_regmatch_slab;
3246 s->next = NULL;
3247 PL_regmatch_slab->next = s;
3248 }
3249 PL_regmatch_slab = s;
86545054 3250 return SLAB_FIRST(s);
5d9a96ca 3251}
5b47454d 3252
95b24440 3253
40a82448
DM
3254/* push a new state then goto it */
3255
4d5016e5
DM
3256#define PUSH_STATE_GOTO(state, node, input) \
3257 pushinput = input; \
40a82448
DM
3258 scan = node; \
3259 st->resume_state = state; \
3260 goto push_state;
3261
3262/* push a new state with success backtracking, then goto it */
3263
4d5016e5
DM
3264#define PUSH_YES_STATE_GOTO(state, node, input) \
3265 pushinput = input; \
40a82448
DM
3266 scan = node; \
3267 st->resume_state = state; \
3268 goto push_yes_state;
3269
aa283a38 3270
aa283a38 3271
4d5016e5 3272
d6a28714 3273/*
95b24440 3274
bf1f174e
DM
3275regmatch() - main matching routine
3276
3277This is basically one big switch statement in a loop. We execute an op,
3278set 'next' to point the next op, and continue. If we come to a point which
3279we may need to backtrack to on failure such as (A|B|C), we push a
3280backtrack state onto the backtrack stack. On failure, we pop the top
3281state, and re-enter the loop at the state indicated. If there are no more
3282states to pop, we return failure.
3283
3284Sometimes we also need to backtrack on success; for example /A+/, where
3285after successfully matching one A, we need to go back and try to
3286match another one; similarly for lookahead assertions: if the assertion
3287completes successfully, we backtrack to the state just before the assertion
3288and then carry on. In these cases, the pushed state is marked as
3289'backtrack on success too'. This marking is in fact done by a chain of
3290pointers, each pointing to the previous 'yes' state. On success, we pop to
3291the nearest yes state, discarding any intermediate failure-only states.
3292Sometimes a yes state is pushed just to force some cleanup code to be
3293called at the end of a successful match or submatch; e.g. (??{$re}) uses
3294it to free the inner regex.
3295
3296Note that failure backtracking rewinds the cursor position, while
3297success backtracking leaves it alone.
3298
3299A pattern is complete when the END op is executed, while a subpattern
3300such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3301ops trigger the "pop to last yes state if any, otherwise return true"
3302behaviour.
3303
3304A common convention in this function is to use A and B to refer to the two
3305subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3306the subpattern to be matched possibly multiple times, while B is the entire
3307rest of the pattern. Variable and state names reflect this convention.
3308
3309The states in the main switch are the union of ops and failure/success of
3310substates associated with with that op. For example, IFMATCH is the op
3311that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3312'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3313successfully matched A and IFMATCH_A_fail is a state saying that we have
3314just failed to match A. Resume states always come in pairs. The backtrack
3315state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3316at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3317on success or failure.
3318
3319The struct that holds a backtracking state is actually a big union, with
3320one variant for each major type of op. The variable st points to the
3321top-most backtrack struct. To make the code clearer, within each
3322block of code we #define ST to alias the relevant union.
3323
3324Here's a concrete example of a (vastly oversimplified) IFMATCH
3325implementation:
3326
3327 switch (state) {
3328 ....
3329
3330#define ST st->u.ifmatch
3331
3332 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3333 ST.foo = ...; // some state we wish to save
95b24440 3334 ...
bf1f174e
DM
3335 // push a yes backtrack state with a resume value of
3336 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3337 // first node of A:
4d5016e5 3338 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3339 // NOTREACHED
3340
3341 case IFMATCH_A: // we have successfully executed A; now continue with B
3342 next = B;
3343 bar = ST.foo; // do something with the preserved value
3344 break;
3345
3346 case IFMATCH_A_fail: // A failed, so the assertion failed
3347 ...; // do some housekeeping, then ...
3348 sayNO; // propagate the failure
3349
3350#undef ST
95b24440 3351
bf1f174e
DM
3352 ...
3353 }
95b24440 3354
bf1f174e
DM
3355For any old-timers reading this who are familiar with the old recursive
3356approach, the code above is equivalent to:
95b24440 3357
bf1f174e
DM
3358 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3359 {
3360 int foo = ...
95b24440 3361 ...
bf1f174e
DM
3362 if (regmatch(A)) {
3363 next = B;
3364 bar = foo;
3365 break;
95b24440 3366 }
bf1f174e
DM
3367 ...; // do some housekeeping, then ...
3368 sayNO; // propagate the failure
95b24440 3369 }
bf1f174e
DM
3370
3371The topmost backtrack state, pointed to by st, is usually free. If you
3372want to claim it, populate any ST.foo fields in it with values you wish to
3373save, then do one of
3374
4d5016e5
DM
3375 PUSH_STATE_GOTO(resume_state, node, newinput);
3376 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3377
3378which sets that backtrack state's resume value to 'resume_state', pushes a
3379new free entry to the top of the backtrack stack, then goes to 'node'.
3380On backtracking, the free slot is popped, and the saved state becomes the
3381new free state. An ST.foo field in this new top state can be temporarily
3382accessed to retrieve values, but once the main loop is re-entered, it
3383becomes available for reuse.
3384
3385Note that the depth of the backtrack stack constantly increases during the
3386left-to-right execution of the pattern, rather than going up and down with
3387the pattern nesting. For example the stack is at its maximum at Z at the
3388end of the pattern, rather than at X in the following:
3389
3390 /(((X)+)+)+....(Y)+....Z/
3391
3392The only exceptions to this are lookahead/behind assertions and the cut,
3393(?>A), which pop all the backtrack states associated with A before
3394continuing.
3395
486ec47a 3396Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3397PL_regmatch_state and st always point to the currently active state,
3398and PL_regmatch_slab points to the slab currently containing
3399PL_regmatch_state. The first time regmatch() is called, the first slab is
3400allocated, and is never freed until interpreter destruction. When the slab
3401is full, a new one is allocated and chained to the end. At exit from
3402regmatch(), slabs allocated since entry are freed.
3403
3404*/
95b24440 3405
40a82448 3406
5bc10b2c 3407#define DEBUG_STATE_pp(pp) \
265c4333 3408 DEBUG_STATE_r({ \
baa60164 3409 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3410 PerlIO_printf(Perl_debug_log, \
5d458dd8 3411 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3412 depth*2, "", \
baa60164 3413 PL_reg_name[st->resume_state], \
5d458dd8
YO
3414 ((st==yes_state||st==mark_state) ? "[" : ""), \
3415 ((st==yes_state) ? "Y" : ""), \
3416 ((st==mark_state) ? "M" : ""), \
3417 ((st==yes_state||st==mark_state) ? "]" : "") \
3418 ); \
265c4333 3419 });
5bc10b2c 3420
40a82448 3421
3dab1dad 3422#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3423
3df15adc 3424#ifdef DEBUGGING
5bc10b2c 3425
ab3bbdeb 3426STATIC void
f2ed9b32 3427S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3428 const char *start, const char *end, const char *blurb)
3429{
efd26800 3430 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3431
3432 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3433
ab3bbdeb
YO
3434 if (!PL_colorset)
3435 reginitcolors();
3436 {
3437 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3438 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3439
f2ed9b32 3440 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3441 start, end - start, 60);
3442
3443 PerlIO_printf(Perl_debug_log,
3444 "%s%s REx%s %s against %s\n",
3445 PL_colors[4], blurb, PL_colors[5], s0, s1);
3446
f2ed9b32 3447 if (utf8_target||utf8_pat)
1de06328
YO
3448 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3449 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3450 utf8_pat && utf8_target ? " and " : "",
3451 utf8_target ? "string" : ""
ab3bbdeb
YO
3452 );
3453 }
3454}
3df15adc
YO
3455
3456STATIC void
786e8c11
YO
3457S_dump_exec_pos(pTHX_ const char *locinput,
3458 const regnode *scan,
3459 const char *loc_regeol,
3460 const char *loc_bostr,
3461 const char *loc_reg_starttry,
f2ed9b32 3462 const bool utf8_target)
07be1b83 3463{
786e8c11 3464 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3465 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3466 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3467 /* The part of the string before starttry has one color
3468 (pref0_len chars), between starttry and current
3469 position another one (pref_len - pref0_len chars),
3470 after the current position the third one.
3471 We assume that pref0_len <= pref_len, otherwise we
3472 decrease pref0_len. */
786e8c11
YO
3473 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3474 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3475 int pref0_len;
3476
7918f24d
NC
3477 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3478
f2ed9b32 3479 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3480 pref_len++;
786e8c11
YO
3481 pref0_len = pref_len - (locinput - loc_reg_starttry);
3482 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3483 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3484 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3485 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3486 l--;
3487 if (pref0_len < 0)
3488 pref0_len = 0;
3489 if (pref0_len > pref_len)
3490 pref0_len = pref_len;
3491 {
f2ed9b32 3492 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3493
ab3bbdeb 3494 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3495 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3496
ab3bbdeb 3497 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3498 (locinput - pref_len + pref0_len),
1de06328 3499 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3500
ab3bbdeb 3501 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3502 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3503
1de06328 3504 const STRLEN tlen=len0+len1+len2;
3df15adc 3505 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3506 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3507 (IV)(locinput - loc_bostr),
07be1b83 3508 len0, s0,
07be1b83 3509 len1, s1,
07be1b83 3510 (docolor ? "" : "> <"),
07be1b83 3511 len2, s2,
f9f4320a 3512 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3513 "");
3514 }
3515}
3df15adc 3516
07be1b83
YO
3517#endif
3518
0a4db386
YO
3519/* reg_check_named_buff_matched()
3520 * Checks to see if a named buffer has matched. The data array of
3521 * buffer numbers corresponding to the buffer is expected to reside
3522 * in the regexp->data->data array in the slot stored in the ARG() of
3523 * node involved. Note that this routine doesn't actually care about the
3524 * name, that information is not preserved from compilation to execution.
3525 * Returns the index of the leftmost defined buffer with the given name
3526 * or 0 if non of the buffers matched.
3527 */
3528STATIC I32
7918f24d
NC
3529S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3530{
0a4db386 3531 I32 n;
f8fc2ecf 3532 RXi_GET_DECL(rex,rexi);
ad64d0ec 3533 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3534 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3535
3536 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3537
0a4db386 3538 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3539 if ((I32)rex->lastparen >= nums[n] &&
3540 rex->offs[nums[n]].end != -1)
0a4db386
YO
3541 {
3542 return nums[n];
3543 }
3544 }
3545 return 0;
3546}
3547
2f554ef7 3548
c74f6de9 3549static bool
984e6dd1 3550S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 3551 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 3552{
79a2a0e8
KW
3553 /* This function determines if there are one or two characters that match
3554 * the first character of the passed-in EXACTish node <text_node>, and if
3555 * so, returns them in the passed-in pointers.
c74f6de9 3556 *
79a2a0e8
KW
3557 * If it determines that no possible character in the target string can
3558 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3559 * the first character in <text_node> requires UTF-8 to represent, and the
3560 * target string isn't in UTF-8.)
c74f6de9 3561 *
79a2a0e8
KW
3562 * If there are more than two characters that could match the beginning of
3563 * <text_node>, or if more context is required to determine a match or not,
3564 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3565 *
3566 * The motiviation behind this function is to allow the caller to set up
3567 * tight loops for matching. If <text_node> is of type EXACT, there is
3568 * only one possible character that can match its first character, and so
3569 * the situation is quite simple. But things get much more complicated if
3570 * folding is involved. It may be that the first character of an EXACTFish
3571 * node doesn't participate in any possible fold, e.g., punctuation, so it
3572 * can be matched only by itself. The vast majority of characters that are
3573 * in folds match just two things, their lower and upper-case equivalents.
3574 * But not all are like that; some have multiple possible matches, or match
3575 * sequences of more than one character. This function sorts all that out.
3576 *
3577 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3578 * loop of trying to match A*, we know we can't exit where the thing
3579 * following it isn't a B. And something can't be a B unless it is the
3580 * beginning of B. By putting a quick test for that beginning in a tight
3581 * loop, we can rule out things that can't possibly be B without having to
3582 * break out of the loop, thus avoiding work. Similarly, if A is a single
3583 * character, we can make a tight loop matching A*, using the outputs of
3584 * this function.
3585 *
3586 * If the target string to match isn't in UTF-8, and there aren't
3587 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3588 * the one or two possible octets (which are characters in this situation)
3589 * that can match. In all cases, if there is only one character that can
3590 * match, *<c1p> and *<c2p> will be identical.
3591 *
3592 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3593 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3594 * can match the beginning of <text_node>. They should be declared with at
3595 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3596 * undefined what these contain.) If one or both of the buffers are
3597 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3598 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3599 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3600 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3601 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9 3602
ba44c216 3603 const bool utf8_target = reginfo->is_utf8_target;
79a2a0e8 3604
ddb0d839
KW
3605 UV c1 = CHRTEST_NOT_A_CP_1;
3606 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8 3607 bool use_chrtest_void = FALSE;
aed7b151 3608 const bool is_utf8_pat = reginfo->is_utf8_pat;
79a2a0e8
KW
3609
3610 /* Used when we have both utf8 input and utf8 output, to avoid converting
3611 * to/from code points */
3612 bool utf8_has_been_setup = FALSE;
3613
c74f6de9
KW
3614 dVAR;
3615
b4291290 3616 U8 *pat = (U8*)STRING(text_node);
a6715020 3617 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
c74f6de9 3618
79a2a0e8
KW
3619 if (OP(text_node) == EXACT) {
3620
3621 /* In an exact node, only one thing can be matched, that first
3622 * character. If both the pat and the target are UTF-8, we can just
3623 * copy the input to the output, avoiding finding the code point of
3624 * that character */
984e6dd1 3625 if (!is_utf8_pat) {
79a2a0e8
KW
3626 c2 = c1 = *pat;
3627 }
3628 else if (utf8_target) {
3629 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3630 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3631 utf8_has_been_setup = TRUE;
3632 }
3633 else {
3634 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3635 }
79a2a0e8 3636 }
31f05a37
KW
3637 else { /* an EXACTFish node */
3638 U8 *pat_end = pat + STR_LEN(text_node);
3639
3640 /* An EXACTFL node has at least some characters unfolded, because what
3641 * they match is not known until now. So, now is the time to fold
3642 * the first few of them, as many as are needed to determine 'c1' and
3643 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
3644 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3645 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
3646 * need to fold as many characters as a single character can fold to,
3647 * so that later we can check if the first ones are such a multi-char
3648 * fold. But, in such a pattern only locale-problematic characters
3649 * aren't folded, so we can skip this completely if the first character
3650 * in the node isn't one of the tricky ones */
3651 if (OP(text_node) == EXACTFL) {
3652
3653 if (! is_utf8_pat) {
3654 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3655 {
3656 folded[0] = folded[1] = 's';
3657 pat = folded;
3658 pat_end = folded + 2;
3659 }
3660 }
3661 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3662 U8 *s = pat;
3663 U8 *d = folded;
3664 int i;
3665
3666 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3667 if (isASCII(*s)) {
3668 *(d++) = (U8) toFOLD_LC(*s);
3669 s++;
3670 }
3671 else {
3672 STRLEN len;
3673 _to_utf8_fold_flags(s,
3674 d,
3675 &len,
3676 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3677 d += len;
3678 s += UTF8SKIP(s);
3679 }
3680 }
3681
3682 pat = folded;
3683 pat_end = d;
3684 }
3685 }
3686
251b239f
KW
3687 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
3688 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
baa60164
KW
3689 {
3690 /* Multi-character folds require more context to sort out. Also
3691 * PL_utf8_foldclosures used below doesn't handle them, so have to
3692 * be handled outside this routine */
3693 use_chrtest_void = TRUE;
3694 }
3695 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3696 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
e8ea8356 3697 if (c1 > 255) {
baa60164
KW
3698 /* Load the folds hash, if not already done */
3699 SV** listp;
3700 if (! PL_utf8_foldclosures) {
31667e6b 3701 _load_PL_utf8_foldclosures();
79a2a0e8 3702 }
79a2a0e8 3703
baa60164
KW
3704 /* The fold closures data structure is a hash with the keys
3705 * being the UTF-8 of every character that is folded to, like
3706 * 'k', and the values each an array of all code points that
3707 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
3708 * Multi-character folds are not included */
3709 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3710 (char *) pat,
3711 UTF8SKIP(pat),
3712 FALSE))))
3713 {
3714 /* Not found in the hash, therefore there are no folds
3715 * containing it, so there is only a single character that
3716 * could match */
3717 c2 = c1;
79a2a0e8 3718 }
baa60164
KW
3719 else { /* Does participate in folds */
3720 AV* list = (AV*) *listp;
b9f2b683 3721 if (av_tindex(list) != 1) {
79a2a0e8 3722
baa60164
KW
3723 /* If there aren't exactly two folds to this, it is
3724 * outside the scope of this function */
3725 use_chrtest_void = TRUE;
79a2a0e8 3726 }
baa60164
KW
3727 else { /* There are two. Get them */
3728 SV** c_p = av_fetch(list, 0, FALSE);
3729 if (c_p == NULL) {
3730 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3731 }
3732 c1 = SvUV(*c_p);
3733
3734 c_p = av_fetch(list, 1, FALSE);
3735 if (c_p == NULL) {
3736 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3737 }
3738 c2 = SvUV(*c_p);
3739
3740 /* Folds that cross the 255/256 boundary are forbidden
3741 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
3742 * one is ASCIII. Since the pattern character is above
e8ea8356 3743 * 255, and its only other match is below 256, the only
baa60164
KW
3744 * legal match will be to itself. We have thrown away
3745 * the original, so have to compute which is the one
e8ea8356 3746 * above 255. */
baa60164
KW
3747 if ((c1 < 256) != (c2 < 256)) {
3748 if ((OP(text_node) == EXACTFL
3749 && ! IN_UTF8_CTYPE_LOCALE)
3750 || ((OP(text_node) == EXACTFA
3751 || OP(text_node) == EXACTFA_NO_TRIE)
3752 && (isASCII(c1) || isASCII(c2))))
3753 {
3754 if (c1 < 256) {
3755 c1 = c2;
3756 }
3757 else {
3758 c2 = c1;
3759 }
79a2a0e8
KW
3760 }
3761 }
3762 }
3763 }
3764 }
e8ea8356 3765 else /* Here, c1 is <= 255 */
baa60164
KW
3766 if (utf8_target
3767 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3768 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
3769 && ((OP(text_node) != EXACTFA
3770 && OP(text_node) != EXACTFA_NO_TRIE)
3771 || ! isASCII(c1)))
3772 {
3773 /* Here, there could be something above Latin1 in the target
3774 * which folds to this character in the pattern. All such
3775 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
3776 * than two characters involved in their folds, so are outside
3777 * the scope of this function */
3778 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3779 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3780 }
3781 else {
3782 use_chrtest_void = TRUE;
3783 }
79a2a0e8 3784 }
baa60164
KW
3785 else { /* Here nothing above Latin1 can fold to the pattern
3786 character */
3787 switch (OP(text_node)) {
c74f6de9 3788
baa60164
KW
3789 case EXACTFL: /* /l rules */
3790 c2 = PL_fold_locale[c1];
3791 break;
c74f6de9 3792
baa60164
KW
3793 case EXACTF: /* This node only generated for non-utf8
3794 patterns */
3795 assert(! is_utf8_pat);
3796 if (! utf8_target) { /* /d rules */
3797 c2 = PL_fold[c1];
3798 break;
3799 }
3800 /* FALLTHROUGH */
3801 /* /u rules for all these. This happens to work for
3802 * EXACTFA as nothing in Latin1 folds to ASCII */
3803 case EXACTFA_NO_TRIE: /* This node only generated for
3804 non-utf8 patterns */
3805 assert(! is_utf8_pat);
924ba076 3806 /* FALLTHROUGH */
baa60164
KW
3807 case EXACTFA:
3808 case EXACTFU_SS:
3809 case EXACTFU:
3810 c2 = PL_fold_latin1[c1];
c74f6de9 3811 break;
c74f6de9 3812
baa60164
KW
3813 default:
3814 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3815 assert(0); /* NOTREACHED */
3816 }
c74f6de9
KW
3817 }
3818 }
3819 }
79a2a0e8
KW
3820
3821 /* Here have figured things out. Set up the returns */
3822 if (use_chrtest_void) {
3823 *c2p = *c1p = CHRTEST_VOID;
3824 }
3825 else if (utf8_target) {
3826 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3827 uvchr_to_utf8(c1_utf8, c1);
3828 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3829 }
c74f6de9 3830
79a2a0e8
KW
3831 /* Invariants are stored in both the utf8 and byte outputs; Use
3832 * negative numbers otherwise for the byte ones. Make sure that the
3833 * byte ones are the same iff the utf8 ones are the same */
3834 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3835 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3836 ? *c2_utf8
3837 : (c1 == c2)
3838 ? CHRTEST_NOT_A_CP_1
3839 : CHRTEST_NOT_A_CP_2;
3840 }
3841 else if (c1 > 255) {
3842 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3843 can represent */
3844 return FALSE;
3845 }
c74f6de9 3846
79a2a0e8
KW
3847 *c1p = *c2p = c2; /* c2 is the only representable value */
3848 }
3849 else { /* c1 is representable; see about c2 */
3850 *c1p = c1;
3851 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3852 }
2f554ef7 3853
c74f6de9
KW
3854 return TRUE;
3855}
2f554ef7 3856
f73aaa43 3857/* returns -1 on failure, $+[0] on success */
99a90e59 3858STATIC SSize_t
f73aaa43 3859S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3860{
a35a87e7 3861#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3862 dMY_CXT;
3863#endif
27da23d5 3864 dVAR;
ba44c216 3865 const bool utf8_target = reginfo->is_utf8_target;
4ad0818d 3866 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3867 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3868 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3869 RXi_GET_DECL(rex,rexi);
5d9a96ca 3870 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3871 regmatch_state *st;
5d9a96ca 3872 /* cache heavy used fields of st in registers */
eb578fdb
KW
3873 regnode *scan;
3874 regnode *next;
3875 U32 n = 0; /* general value; init to avoid compiler warning */
ea3daa5d 3876 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3877 char *locinput = startpos;
4d5016e5 3878 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3879 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3880
b69b0499 3881 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3882 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3883 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3884 const U32 max_nochange_depth =
3885 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3886 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3887 regmatch_state *yes_state = NULL; /* state to pop to on success of
3888 subpattern */
e2e6a0f1
YO
3889 /* mark_state piggy backs on the yes_state logic so that when we unwind
3890 the stack on success we can update the mark_state as we go */
3891 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3892 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3893 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3894 U32 state_num;
5d458dd8
YO
3895 bool no_final = 0; /* prevent failure from backtracking? */
3896 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3897 char *startpoint = locinput;
5d458dd8
YO
3898 SV *popmark = NULL; /* are we looking for a mark? */
3899 SV *sv_commit = NULL; /* last mark name seen in failure */
3900 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3901 during a successful match */
5d458dd8
YO
3902 U32 lastopen = 0; /* last open we saw */
3903 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
4a2b275c 3904 SV* const oreplsv = GvSVn(PL_replgv);
24d3c4a9
DM
3905 /* these three flags are set by various ops to signal information to
3906 * the very next op. They have a useful lifetime of exactly one loop
3907 * iteration, and are not preserved or restored by state pushes/pops
3908 */
3909 bool sw = 0; /* the condition value in (?(cond)a|b) */
3910 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3911 int logical = 0; /* the following EVAL is:
3912 0: (?{...})
3913 1: (?(?{...})X|Y)
3914 2: (??{...})
3915 or the following IFMATCH/UNLESSM is:
3916 false: plain (?=foo)
3917 true: used as a condition: (?(?=foo))
3918 */
81ed78b2
DM
3919 PAD* last_pad = NULL;
3920 dMULTICALL;
3921 I32 gimme = G_SCALAR;
3922 CV *caller_cv = NULL; /* who called us */
3923 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3924 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
92da3157 3925 U32 maxopenparen = 0; /* max '(' index seen so far */
3018b823
KW
3926 int to_complement; /* Invert the result? */
3927 _char_class_number classnum;
984e6dd1 3928 bool is_utf8_pat = reginfo->is_utf8_pat;
81ed78b2 3929
95b24440 3930#ifdef DEBUGGING
e68ec53f 3931 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
3932#endif
3933
7e68f152
FC
3934 /* protect against undef(*^R) */
3935 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
3936
81ed78b2
DM
3937 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3938 multicall_oldcatch = 0;
3939 multicall_cv = NULL;
3940 cx = NULL;
4f8dbb2d
JL
3941 PERL_UNUSED_VAR(multicall_cop);
3942 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
3943
3944
7918f24d
NC
3945 PERL_ARGS_ASSERT_REGMATCH;
3946
3b57cd43 3947 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 3948 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 3949 }));
5d9a96ca 3950
331b2dcc 3951 st = PL_regmatch_state;
5d9a96ca 3952
d6a28714 3953 /* Note that nextchr is a byte even in UTF */
7016d6eb 3954 SET_nextchr;
d6a28714
JH
3955 scan = prog;
3956 while (scan != NULL) {
8ba1375e 3957
a3621e74 3958 DEBUG_EXECUTE_r( {
6136c704 3959 SV * const prop = sv_newmortal();
1de06328 3960 regnode *rnext=regnext(scan);
f2ed9b32 3961 DUMP_EXEC_POS( locinput, scan, utf8_target );
2395827c 3962 regprop(rex, prop, scan, reginfo);
07be1b83
YO
3963
3964 PerlIO_printf(Perl_debug_log,
3965 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 3966 (IV)(scan - rexi->program), depth*2, "",
07be1b83 3967 SvPVX_const(prop),
1de06328 3968 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 3969 0 : (IV)(rnext - rexi->program));
2a782b5b 3970 });
d6a28714
JH
3971
3972 next = scan + NEXT_OFF(scan);
3973 if (next == scan)
3974 next = NULL;
40a82448 3975 state_num = OP(scan);
d6a28714 3976
40a82448 3977 reenter_switch:
3018b823 3978 to_complement = 0;
34a81e2b 3979
7016d6eb 3980 SET_nextchr;
e6ca698c 3981 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 3982
40a82448 3983 switch (state_num) {
b838857b
DM
3984 case BOL: /* /^../ */
3985 case SBOL: /* /^../s */
9d9163fb 3986 if (locinput == reginfo->strbeg)
b8c5462f 3987 break;
d6a28714 3988 sayNO;
3c0563b9
DM
3989
3990 case MBOL: /* /^../m */
9d9163fb 3991 if (locinput == reginfo->strbeg ||
7016d6eb 3992 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 3993 {
b8c5462f
JH
3994 break;
3995 }
d6a28714 3996 sayNO;
3c0563b9 3997
3c0563b9 3998 case GPOS: /* \G */
3b0527fe 3999 if (locinput == reginfo->ganch)
d6a28714
JH
4000 break;
4001 sayNO;
ee9b8eae 4002
3c0563b9 4003 case KEEPS: /* \K */
ee9b8eae 4004 /* update the startpoint */
b93070ed 4005 st->u.keeper.val = rex->offs[0].start;
9d9163fb 4006 rex->offs[0].start = locinput - reginfo->strbeg;
4d5016e5 4007 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
878531d3 4008 assert(0); /*NOTREACHED*/
ee9b8eae
YO
4009 case KEEPS_next_fail:
4010 /* rollback the start point change */
b93070ed 4011 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 4012 sayNO_SILENT;
878531d3 4013 assert(0); /*NOTREACHED*/
3c0563b9 4014
3c0563b9 4015 case MEOL: /* /..$/m */
7016d6eb 4016 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 4017 sayNO;
b8c5462f 4018 break;
3c0563b9 4019
62e16618 4020 case EOL: /* /..$/ */
924ba076 4021 /* FALLTHROUGH */
3c0563b9 4022 case SEOL: /* /..$/s */
7016d6eb 4023 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 4024 sayNO;
220db18a 4025 if (reginfo->strend - locinput > 1)
b8c5462f 4026 sayNO;
b8c5462f 4027 break;
3c0563b9
DM
4028
4029 case EOS: /* \z */
7016d6eb 4030 if (!NEXTCHR_IS_EOS)
b8c5462f 4031 sayNO;
d6a28714 4032 break;
3c0563b9
DM
4033
4034 case SANY: /* /./s */
7016d6eb 4035 if (NEXTCHR_IS_EOS)
4633a7c4 4036 sayNO;
28b98f76 4037 goto increment_locinput;
3c0563b9
DM
4038
4039 case CANY: /* \C */
7016d6eb 4040 if (NEXTCHR_IS_EOS)
f33976b4 4041 sayNO;
3640db6b 4042 locinput++;
a0d0e21e 4043 break;
3c0563b9
DM
4044
4045 case REG_ANY: /* /./ */
7016d6eb 4046 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 4047 sayNO;
28b98f76
DM
4048 goto increment_locinput;
4049
166ba7cd
DM
4050
4051#undef ST
4052#define ST st->u.trie
3c0563b9 4053 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
4054 /* In this case the charclass data is available inline so
4055 we can fail fast without a lot of extra overhead.
4056 */
7016d6eb 4057 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
4058 DEBUG_EXECUTE_r(
4059 PerlIO_printf(Perl_debug_log,
4060 "%*s %sfailed to match trie start class...%s\n",
4061 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4062 );
4063 sayNO_SILENT;
118e2215 4064 assert(0); /* NOTREACHED */
786e8c11 4065 }
924ba076 4066 /* FALLTHROUGH */
3c0563b9 4067 case TRIE: /* (ab|cd) */
2e64971a
DM
4068 /* the basic plan of execution of the trie is:
4069 * At the beginning, run though all the states, and
4070 * find the longest-matching word. Also remember the position
4071 * of the shortest matching word. For example, this pattern:
4072 * 1 2 3 4 5
4073 * ab|a|x|abcd|abc
4074 * when matched against the string "abcde", will generate
4075 * accept states for all words except 3, with the longest
895cc420 4076 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
4077 * the position being after char 1 of the string).
4078 *
4079 * Then for each matching word, in word order (i.e. 1,2,4,5),
4080 * we run the remainder of the pattern; on each try setting
4081 * the current position to the character following the word,
4082 * returning to try the next word on failure.
4083 *
4084 * We avoid having to build a list of words at runtime by
4085 * using a compile-time structure, wordinfo[].prev, which
4086 * gives, for each word, the previous accepting word (if any).
4087 * In the case above it would contain the mappings 1->2, 2->0,
4088 * 3->0, 4->5, 5->1. We can use this table to generate, from
4089 * the longest word (4 above), a list of all words, by
4090 * following the list of prev pointers; this gives us the
4091 * unordered list 4,5,1,2. Then given the current word we have
4092 * just tried, we can go through the list and find the
4093 * next-biggest word to try (so if we just failed on word 2,
4094 * the next in the list is 4).
4095 *
4096 * Since at runtime we don't record the matching position in
4097 * the string for each word, we have to work that out for
4098 * each word we're about to process. The wordinfo table holds
4099 * the character length of each word; given that we recorded
4100 * at the start: the position of the shortest word and its
4101 * length in chars, we just need to move the pointer the
4102 * difference between the two char lengths. Depending on
4103 * Unicode status and folding, that's cheap or expensive.
4104 *
4105 * This algorithm is optimised for the case where are only a
4106 * small number of accept states, i.e. 0,1, or maybe 2.
4107 * With lots of accepts states, and having to try all of them,
4108 * it becomes quadratic on number of accept states to find all
4109 * the next words.
4110 */
4111
3dab1dad 4112 {
07be1b83 4113 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 4114 DECL_TRIE_TYPE(scan);
3dab1dad
YO
4115
4116 /* what trie are we using right now */
be8e71aa 4117 reg_trie_data * const trie
f8fc2ecf 4118 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 4119 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 4120 U32 state = trie->startstate;
166ba7cd 4121
7016d6eb
DM
4122 if ( trie->bitmap
4123 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
4124 {
3dab1dad
YO
4125 if (trie->states[ state ].wordnum) {
4126 DEBUG_EXECUTE_r(
4127 PerlIO_printf(Perl_debug_log,
4128 "%*s %smatched empty string...%s\n",
5bc10b2c 4129 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 4130 );
20dbff7c
YO
4131 if (!trie->jump)
4132 break;
3dab1dad
YO
4133 } else {
4134 DEBUG_EXECUTE_r(
4135 PerlIO_printf(Perl_debug_log,
786e8c11 4136 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 4137 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
4138 );
4139 sayNO_SILENT;
4140 }
4141 }
166ba7cd 4142
786e8c11
YO
4143 {
4144 U8 *uc = ( U8* )locinput;
4145
4146 STRLEN len = 0;
4147 STRLEN foldlen = 0;
4148 U8 *uscan = (U8*)NULL;
786e8c11 4149 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
4150 U32 charcount = 0; /* how many input chars we have matched */
4151 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 4152
786e8c11 4153 ST.jump = trie->jump;
786e8c11 4154 ST.me = scan;
2e64971a
DM
4155 ST.firstpos = NULL;
4156 ST.longfold = FALSE; /* char longer if folded => it's harder */
4157 ST.nextword = 0;
4158
4159 /* fully traverse the TRIE; note the position of the
4160 shortest accept state and the wordnum of the longest
4161 accept state */
07be1b83 4162
220db18a 4163 while ( state && uc <= (U8*)(reginfo->strend) ) {
786e8c11 4164 U32 base = trie->states[ state ].trans.base;
f9f4320a 4165 UV uvc = 0;
acb909b4 4166 U16 charid = 0;
2e64971a
DM
4167 U16 wordnum;
4168 wordnum = trie->states[ state ].wordnum;
4169
4170 if (wordnum) { /* it's an accept state */
4171 if (!accepted) {
4172 accepted = 1;
4173 /* record first match position */
4174 if (ST.longfold) {
4175 ST.firstpos = (U8*)locinput;
4176 ST.firstchars = 0;
5b47454d 4177 }
2e64971a
DM
4178 else {
4179 ST.firstpos = uc;
4180 ST.firstchars = charcount;
4181 }
4182 }
4183 if (!ST.nextword || wordnum < ST.nextword)
4184 ST.nextword = wordnum;
4185 ST.topword = wordnum;
786e8c11 4186 }
a3621e74 4187
07be1b83 4188 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 4189 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 4190 PerlIO_printf( Perl_debug_log,
2e64971a 4191 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 4192 2+depth * 2, "", PL_colors[4],
2e64971a 4193 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 4194 });
a3621e74 4195
2e64971a 4196 /* read a char and goto next state */
220db18a 4197 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6dd2be57 4198 I32 offset;
55eed653
NC
4199 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
4200 uscan, len, uvc, charid, foldlen,
4201 foldbuf, uniflags);
2e64971a
DM
4202 charcount++;
4203 if (foldlen>0)
4204 ST.longfold = TRUE;
5b47454d 4205 if (charid &&
6dd2be57
DM
4206 ( ((offset =
4207 base + charid - 1 - trie->uniquecharcount)) >= 0)
4208
4209 && ((U32)offset < trie->lasttrans)
4210 && trie->trans[offset].check == state)
5b47454d 4211 {
6dd2be57 4212 state = trie->trans[offset].next;
5b47454d
DM
4213 }
4214 else {
4215 state = 0;
4216 }
4217 uc += len;
4218
4219 }
4220 else {
a3621e74
YO
4221 state = 0;
4222 }
4223 DEBUG_TRIE_EXECUTE_r(
e4584336 4224 PerlIO_printf( Perl_debug_log,
786e8c11 4225 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 4226 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
4227 );
4228 }
2e64971a 4229 if (!accepted)
a3621e74 4230 sayNO;
a3621e74 4231
2e64971a
DM
4232 /* calculate total number of accept states */
4233 {
4234 U16 w = ST.topword;
4235 accepted = 0;
4236 while (w) {
4237 w = trie->wordinfo[w].prev;
4238 accepted++;
4239 }
4240 ST.accepted = accepted;
4241 }
4242
166ba7cd
DM
4243 DEBUG_EXECUTE_r(
4244 PerlIO_printf( Perl_debug_log,
4245 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 4246 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
4247 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4248 );
2e64971a 4249 goto trie_first_try; /* jump into the fail handler */
786e8c11 4250 }}
118e2215 4251 assert(0); /* NOTREACHED */
2e64971a
DM
4252
4253 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
4254 {
4255 U8 *uc;
fae667d5
YO
4256 if ( ST.jump) {
4257 REGCP_UNWIND(ST.cp);
a8d1f4b4 4258 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 4259 }
2e64971a
DM
4260 if (!--ST.accepted) {
4261 DEBUG_EXECUTE_r({
4262 PerlIO_printf( Perl_debug_log,
4263 "%*s %sTRIE failed...%s\n",
4264 REPORT_CODE_OFF+depth*2, "",
4265 PL_colors[4],
4266 PL_colors[5] );
4267 });
4268 sayNO_SILENT;
4269 }
4270 {
4271 /* Find next-highest word to process. Note that this code
4272 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
4273 U16 min = 0;
4274 U16 word;
4275 U16 const nextword = ST.nextword;
4276 reg_trie_wordinfo * const wordinfo
2e64971a
DM
4277 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4278 for (word=ST.topword; word; word=wordinfo[word].prev) {
4279 if (word > nextword && (!min || word < min))
4280 min = word;
4281 }
4282 ST.nextword = min;
4283 }
4284
fae667d5 4285 trie_first_try:
5d458dd8
YO
4286 if (do_cutgroup) {
4287 do_cutgroup = 0;
4288 no_final = 0;
4289 }
fae667d5
YO
4290
4291 if ( ST.jump) {
b93070ed 4292 ST.lastparen = rex->lastparen;
f6033a9d 4293 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 4294 REGCP_SET(ST.cp);
2e64971a 4295 }
a3621e74 4296
2e64971a 4297 /* find start char of end of current word */
166ba7cd 4298 {
2e64971a 4299 U32 chars; /* how many chars to skip */
2e64971a
DM
4300 reg_trie_data * const trie
4301 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4302
4303 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4304 >= ST.firstchars);
4305 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4306 - ST.firstchars;
a059a757 4307 uc = ST.firstpos;
2e64971a
DM
4308
4309 if (ST.longfold) {
4310 /* the hard option - fold each char in turn and find
4311 * its folded length (which may be different */
4312 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4313 STRLEN foldlen;
4314 STRLEN len;
d9a396a3 4315 UV uvc;
2e64971a
DM
4316 U8 *uscan;
4317
4318 while (chars) {
f2ed9b32 4319 if (utf8_target) {
c80e42f3 4320 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
2e64971a
DM
4321 uniflags);
4322 uc += len;
4323 }
4324 else {
4325 uvc = *uc;
4326 uc++;
4327 }
4328 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4329 uscan = foldbuf;
4330 while (foldlen) {
4331 if (!--chars)
4332 break;
c80e42f3 4333 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
2e64971a
DM
4334 uniflags);
4335 uscan += len;
4336 foldlen -= len;
4337 }
4338 }
a3621e74 4339 }
2e64971a 4340 else {
f2ed9b32 4341 if (utf8_target)
2e64971a
DM
4342 while (chars--)
4343 uc += UTF8SKIP(uc);
4344 else
4345 uc += chars;
4346 }
2e64971a 4347 }
166ba7cd 4348
6603fe3e
DM
4349 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4350 ? ST.jump[ST.nextword]
4351 : NEXT_OFF(ST.me));
166ba7cd 4352
2e64971a
DM
4353 DEBUG_EXECUTE_r({
4354 PerlIO_printf( Perl_debug_log,
4355 "%*s %sTRIE matched word #%d, continuing%s\n",
4356 REPORT_CODE_OFF+depth*2, "",
4357 PL_colors[4],
4358 ST.nextword,
4359 PL_colors[5]
4360 );
4361 });
4362
4363 if (ST.accepted > 1 || has_cutgroup) {
a059a757 4364 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
118e2215 4365 assert(0); /* NOTREACHED */
166ba7cd 4366 }
2e64971a
DM
4367 /* only one choice left - just continue */
4368 DEBUG_EXECUTE_r({
4369 AV *const trie_words
4370 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4371 SV ** const tmp = av_fetch( trie_words,
4372 ST.nextword-1, 0 );
4373 SV *sv= tmp ? sv_newmortal() : NULL;
4374
4375 PerlIO_printf( Perl_debug_log,
4376 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4377 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4378 ST.nextword,
4379 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4380 PL_colors[0], PL_colors[1],
c89df6cf 4381 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
4382 )
4383 : "not compiled under -Dr",
4384 PL_colors[5] );
4385 });
4386
a059a757 4387 locinput = (char*)uc;
2e64971a 4388 continue; /* execute rest of RE */
118e2215 4389 assert(0); /* NOTREACHED */
a059a757 4390 }
166ba7cd
DM
4391#undef ST
4392
3c0563b9 4393 case EXACT: { /* /abc/ */
95b24440 4394 char *s = STRING(scan);
24d3c4a9 4395 ln = STR_LEN(scan);
984e6dd1 4396 if (utf8_target != is_utf8_pat) {
bc517b45 4397 /* The target and the pattern have differing utf8ness. */
1aa99e6b 4398 char *l = locinput;
24d3c4a9 4399 const char * const e = s + ln;
a72c7584 4400
f2ed9b32 4401 if (utf8_target) {
e6a3850e
KW
4402 /* The target is utf8, the pattern is not utf8.
4403 * Above-Latin1 code points can't match the pattern;
4404 * invariants match exactly, and the other Latin1 ones need
4405 * to be downgraded to a single byte in order to do the
4406 * comparison. (If we could be confident that the target
4407 * is not malformed, this could be refactored to have fewer
4408 * tests by just assuming that if the first bytes match, it
4409 * is an invariant, but there are tests in the test suite
4410 * dealing with (??{...}) which violate this) */
1aa99e6b 4411 while (s < e) {
220db18a
DM
4412 if (l >= reginfo->strend
4413 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4414 {
e6a3850e
KW
4415 sayNO;
4416 }
4417 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4418 if (*l != *s) {
4419 sayNO;
4420 }
4421 l++;
4422 }
4423 else {
94bb8c36
KW
4424 if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4425 {
e6a3850e
KW
4426 sayNO;
4427 }
4428 l += 2;
4429 }
4430 s++;
1aa99e6b 4431 }
5ff6fc6d
JH
4432 }
4433 else {
4434 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 4435 while (s < e) {
220db18a
DM
4436 if (l >= reginfo->strend
4437 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
e6a3850e
KW
4438 {
4439 sayNO;
4440 }
4441 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4442 if (*s != *l) {
4443 sayNO;
4444 }
4445 s++;
4446 }
4447 else {
94bb8c36
KW
4448 if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4449 {
e6a3850e
KW
4450 sayNO;
4451 }
4452 s += 2;
4453 }
4454 l++;
1aa99e6b 4455 }
5ff6fc6d 4456 }
1aa99e6b 4457 locinput = l;
1aa99e6b 4458 }
5ac65bff
KW
4459 else {
4460 /* The target and the pattern have the same utf8ness. */
4461 /* Inline the first character, for speed. */
220db18a 4462 if (reginfo->strend - locinput < ln
5ac65bff
KW
4463 || UCHARAT(s) != nextchr
4464 || (ln > 1 && memNE(s, locinput, ln)))
4465 {
4466 sayNO;
4467 }
4468 locinput += ln;
4469 }
d6a28714 4470 break;
95b24440 4471 }
7016d6eb 4472
3c0563b9 4473 case EXACTFL: { /* /abc/il */
a932d541 4474 re_fold_t folder;
9a5a5549
KW
4475 const U8 * fold_array;
4476 const char * s;
d513472c 4477 U32 fold_utf8_flags;
9a5a5549 4478
f67f9e53
KW
4479 folder = foldEQ_locale;
4480 fold_array = PL_fold_locale;
cea315b6 4481 fold_utf8_flags = FOLDEQ_LOCALE;
9a5a5549
KW
4482 goto do_exactf;
4483
3c0563b9 4484 case EXACTFU_SS: /* /\x{df}/iu */
3c0563b9 4485 case EXACTFU: /* /abc/iu */
9a5a5549
KW
4486 folder = foldEQ_latin1;
4487 fold_array = PL_fold_latin1;
984e6dd1 4488 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
4489 goto do_exactf;
4490
098b07d5
KW
4491 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
4492 patterns */
4493 assert(! is_utf8_pat);
924ba076 4494 /* FALLTHROUGH */
3c0563b9 4495 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
4496 folder = foldEQ_latin1;
4497 fold_array = PL_fold_latin1;
57014d77 4498 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
4499 goto do_exactf;
4500
2fdb7295
KW
4501 case EXACTF: /* /abc/i This node only generated for
4502 non-utf8 patterns */
4503 assert(! is_utf8_pat);
9a5a5549
KW
4504 folder = foldEQ;
4505 fold_array = PL_fold;
62bf7766 4506 fold_utf8_flags = 0;
9a5a5549
KW
4507
4508 do_exactf:
4509 s = STRING(scan);
24d3c4a9 4510 ln = STR_LEN(scan);
d6a28714 4511
31f05a37
KW
4512 if (utf8_target
4513 || is_utf8_pat
4514 || state_num == EXACTFU_SS
4515 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4516 {
3c760661
KW
4517 /* Either target or the pattern are utf8, or has the issue where
4518 * the fold lengths may differ. */
be8e71aa 4519 const char * const l = locinput;
220db18a 4520 char *e = reginfo->strend;
bc517b45 4521
984e6dd1 4522 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
fa5b1667 4523 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
4524 {
4525 sayNO;
5486206c 4526 }
d07ddd77 4527 locinput = e;
d07ddd77 4528 break;
a0ed51b3 4529 }
d6a28714 4530
0a138b74 4531 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
4532 if (UCHARAT(s) != nextchr
4533 && !NEXTCHR_IS_EOS
4534 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 4535 {
a0ed51b3 4536 sayNO;
9a5a5549 4537 }
220db18a 4538 if (reginfo->strend - locinput < ln)
b8c5462f 4539 sayNO;
9a5a5549 4540 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 4541 sayNO;
24d3c4a9 4542 locinput += ln;
a0d0e21e 4543 break;
9a5a5549 4544 }
63ac0dad
KW
4545
4546 /* XXX Could improve efficiency by separating these all out using a
4547 * macro or in-line function. At that point regcomp.c would no longer
4548 * have to set the FLAGS fields of these */
3c0563b9
DM
4549 case BOUNDL: /* /\b/l */
4550 case NBOUNDL: /* /\B/l */
3c0563b9
DM
4551 case BOUND: /* /\b/ */
4552 case BOUNDU: /* /\b/u */
4553 case BOUNDA: /* /\b/a */
4554 case NBOUND: /* /\B/ */
4555 case NBOUNDU: /* /\B/u */
4556 case NBOUNDA: /* /\B/a */
b2680017 4557 /* was last char in word? */
f2e96b5d
KW
4558 if (utf8_target
4559 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4560 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4561 {
9d9163fb 4562 if (locinput == reginfo->strbeg)
b2680017
YO
4563 ln = '\n';
4564 else {
9d9163fb
DM
4565 const U8 * const r =
4566 reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
b2680017 4567
3db24e1e
KW
4568 ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
4569 0, uniflags);
b2680017 4570 }
63ac0dad 4571 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
0eb30aeb 4572 ln = isWORDCHAR_uni(ln);
7016d6eb
DM
4573 if (NEXTCHR_IS_EOS)
4574 n = 0;
4575 else {
4576 LOAD_UTF8_CHARCLASS_ALNUM();
03940dc2 4577 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
7016d6eb
DM
4578 utf8_target);
4579 }
b2680017
YO
4580 }
4581 else {
f4cd282c 4582 ln = isWORDCHAR_LC_uvchr(ln);
0eb30aeb 4583 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
b2680017
YO
4584 }
4585 }
4586 else {
cfaf538b
KW
4587
4588 /* Here the string isn't utf8, or is utf8 and only ascii
4589 * characters are to match \w. In the latter case looking at
4590 * the byte just prior to the current one may be just the final
4591 * byte of a multi-byte character. This is ok. There are two
4592 * cases:
4593 * 1) it is a single byte character, and then the test is doing
4594 * just what it's supposed to.
4595 * 2) it is a multi-byte character, in which case the final
4596 * byte is never mistakable for ASCII, and so the test
4597 * will say it is not a word character, which is the
4598 * correct answer. */
9d9163fb 4599 ln = (locinput != reginfo->strbeg) ?
b2680017 4600 UCHARAT(locinput - 1) : '\n';
63ac0dad
KW
4601 switch (FLAGS(scan)) {
4602 case REGEX_UNICODE_CHARSET:
4603 ln = isWORDCHAR_L1(ln);
7016d6eb 4604 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
63ac0dad
KW
4605 break;
4606 case REGEX_LOCALE_CHARSET:
0eb30aeb
KW
4607 ln = isWORDCHAR_LC(ln);
4608 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
63ac0dad
KW
4609 break;
4610 case REGEX_DEPENDS_CHARSET:
0eb30aeb
KW
4611 ln = isWORDCHAR(ln);
4612 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
63ac0dad 4613 break;
cfaf538b 4614 case REGEX_ASCII_RESTRICTED_CHARSET:
c973bd4f 4615 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b 4616 ln = isWORDCHAR_A(ln);
7016d6eb 4617 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
cfaf538b 4618 break;
63ac0dad
KW
4619 default:
4620 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4621 break;
b2680017
YO
4622 }
4623 }
63ac0dad
KW
4624 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4625 * regcomp.sym */
4626 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
b2680017
YO
4627 sayNO;
4628 break;
3c0563b9 4629
3c0563b9 4630 case ANYOF: /* /[abc]/ */
7016d6eb
DM
4631 if (NEXTCHR_IS_EOS)
4632 sayNO;
e0193e47 4633 if (utf8_target) {
3db24e1e
KW
4634 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
4635 utf8_target))
09b08e9b 4636 sayNO;
635cd5d4 4637 locinput += UTF8SKIP(locinput);
ffc61ed2
JH
4638 }
4639 else {
20ed0b26 4640 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 4641 sayNO;
3640db6b 4642 locinput++;
e0f9d4a8 4643 }
b8c5462f 4644 break;
3c0563b9 4645
3018b823
KW
4646 /* The argument (FLAGS) to all the POSIX node types is the class number
4647 * */
ee9a90b8 4648
3018b823
KW
4649 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4650 to_complement = 1;
4651 /* FALLTHROUGH */
4652
4653 case POSIXL: /* \w or [:punct:] etc. under /l */
4654 if (NEXTCHR_IS_EOS)
bedac28b 4655 sayNO;
bedac28b 4656
3018b823
KW
4657 /* Use isFOO_lc() for characters within Latin1. (Note that
4658 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4659 * wouldn't be invariant) */
4660 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
eb4e9c04 4661 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
bedac28b
KW
4662 sayNO;
4663 }
4664 }
3018b823
KW
4665 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4666 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
94bb8c36 4667 (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
3018b823
KW
4668 *(locinput + 1))))))
4669 {
bedac28b 4670 sayNO;
3018b823 4671 }
bedac28b 4672 }
3018b823
KW
4673 else { /* Here, must be an above Latin-1 code point */
4674 goto utf8_posix_not_eos;
bedac28b 4675 }
3018b823
KW
4676
4677 /* Here, must be utf8 */
4678 locinput += UTF8SKIP(locinput);
bedac28b
KW
4679 break;
4680
3018b823
KW
4681 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4682 to_complement = 1;
4683 /* FALLTHROUGH */
4684
4685 case POSIXD: /* \w or [:punct:] etc. under /d */
bedac28b 4686 if (utf8_target) {
3018b823 4687 goto utf8_posix;
bedac28b 4688 }
3018b823
KW
4689 goto posixa;
4690
4691 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
bedac28b 4692
3018b823 4693 if (NEXTCHR_IS_EOS) {
bedac28b
KW
4694 sayNO;
4695 }
bedac28b 4696
3018b823
KW
4697 /* All UTF-8 variants match */
4698 if (! UTF8_IS_INVARIANT(nextchr)) {
4699 goto increment_locinput;
bedac28b 4700 }
ee9a90b8 4701
3018b823
KW
4702 to_complement = 1;
4703 /* FALLTHROUGH */
4704
4705 case POSIXA: /* \w or [:punct:] etc. under /a */
4706
4707 posixa:
4708 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4709 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4710 * character is a single byte */
20d0b1e9 4711
3018b823
KW
4712 if (NEXTCHR_IS_EOS
4713 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4714 FLAGS(scan)))))
4715 {
0658cdde
KW
4716 sayNO;
4717 }
3018b823
KW
4718
4719 /* Here we are either not in utf8, or we matched a utf8-invariant,
4720 * so the next char is the next byte */
3640db6b 4721 locinput++;
0658cdde 4722 break;
3c0563b9 4723
3018b823
KW
4724 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4725 to_complement = 1;
4726 /* FALLTHROUGH */
4727
4728 case POSIXU: /* \w or [:punct:] etc. under /u */
4729 utf8_posix:
4730 if (NEXTCHR_IS_EOS) {
0658cdde
KW
4731 sayNO;
4732 }
3018b823
KW
4733 utf8_posix_not_eos:
4734
4735 /* Use _generic_isCC() for characters within Latin1. (Note that
4736 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4737 * wouldn't be invariant) */
4738 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4739 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4740 FLAGS(scan)))))
4741 {
4742 sayNO;
4743 }
4744 locinput++;
4745 }
4746 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4747 if (! (to_complement
94bb8c36 4748 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
3018b823 4749 *(locinput + 1)),
94bb8c36 4750 FLAGS(scan)))))
3018b823
KW
4751 {
4752 sayNO;
4753 }
4754 locinput += 2;
4755 }
4756 else { /* Handle above Latin-1 code points */
4757 classnum = (_char_class_number) FLAGS(scan);
4758 if (classnum < _FIRST_NON_SWASH_CC) {
4759
4760 /* Here, uses a swash to find such code points. Load if if
4761 * not done already */
4762 if (! PL_utf8_swash_ptrs[classnum]) {
4763 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4764 PL_utf8_swash_ptrs[classnum]
4765 = _core_swash_init("utf8",
2a16ac92
KW
4766 "",
4767 &PL_sv_undef, 1, 0,
4768 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
4769 }
4770 if (! (to_complement
4771 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4772 (U8 *) locinput, TRUE))))
4773 {
4774 sayNO;
4775 }
4776 }
4777 else { /* Here, uses macros to find above Latin-1 code points */
4778 switch (classnum) {
4779 case _CC_ENUM_SPACE: /* XXX would require separate
4780 code if we revert the change
4781 of \v matching this */
4782 case _CC_ENUM_PSXSPC:
4783 if (! (to_complement
4784 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4785 {
4786 sayNO;
4787 }
4788 break;
4789 case _CC_ENUM_BLANK:
4790 if (! (to_complement
4791 ^ cBOOL(is_HORIZWS_high(locinput))))
4792 {
4793 sayNO;
4794 }
4795 break;
4796 case _CC_ENUM_XDIGIT:
4797 if (! (to_complement
4798 ^ cBOOL(is_XDIGIT_high(locinput))))
4799 {
4800 sayNO;
4801 }
4802 break;
4803 case _CC_ENUM_VERTSPACE:
4804 if (! (to_complement
4805 ^ cBOOL(is_VERTWS_high(locinput))))
4806 {
4807 sayNO;
4808 }
4809 break;
4810 default: /* The rest, e.g. [:cntrl:], can't match
4811 above Latin1 */
4812 if (! to_complement) {
4813 sayNO;
4814 }
4815 break;
4816 }
4817 }
4818 locinput += UTF8SKIP(locinput);
4819 }
4820 break;
0658cdde 4821
37e2e78e
KW
4822 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4823 a Unicode extended Grapheme Cluster */
4824 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4825 extended Grapheme Cluster is:
4826
7aee35ff
KW
4827 CR LF
4828 | Prepend* Begin Extend*
4829 | .
37e2e78e 4830
7aee35ff
KW
4831 Begin is: ( Special_Begin | ! Control )
4832 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4833 Extend is: ( Grapheme_Extend | Spacing_Mark )
4834 Control is: [ GCB_Control | CR | LF ]
4835 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
37e2e78e 4836
27d4fc33
KW
4837 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4838 we can rewrite
4839
4840 Begin is ( Regular_Begin + Special Begin )
4841
4842 It turns out that 98.4% of all Unicode code points match
4843 Regular_Begin. Doing it this way eliminates a table match in
c101f46d 4844 the previous implementation for almost all Unicode code points.
27d4fc33 4845
37e2e78e
KW
4846 There is a subtlety with Prepend* which showed up in testing.
4847 Note that the Begin, and only the Begin is required in:
4848 | Prepend* Begin Extend*
cc3b396d
KW
4849 Also, Begin contains '! Control'. A Prepend must be a
4850 '! Control', which means it must also be a Begin. What it
4851 comes down to is that if we match Prepend* and then find no
4852 suitable Begin afterwards, that if we backtrack the last
4853 Prepend, that one will be a suitable Begin.
37e2e78e
KW
4854 */
4855
7016d6eb 4856 if (NEXTCHR_IS_EOS)
a0ed51b3 4857 sayNO;
f2ed9b32 4858 if (! utf8_target) {
37e2e78e
KW
4859
4860 /* Match either CR LF or '.', as all the other possibilities
4861 * require utf8 */
4862 locinput++; /* Match the . or CR */
cc3b396d
KW
4863 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4864 match the LF */
220db18a 4865 && locinput < reginfo->strend
e699a1d5
KW
4866 && UCHARAT(locinput) == '\n')
4867 {
4868 locinput++;
4869 }
37e2e78e
KW
4870 }
4871 else {
4872
4873 /* Utf8: See if is ( CR LF ); already know that locinput <
220db18a
DM
4874 * reginfo->strend, so locinput+1 is in bounds */
4875 if ( nextchr == '\r' && locinput+1 < reginfo->strend
f67f9e53 4876 && UCHARAT(locinput + 1) == '\n')
7016d6eb 4877 {
37e2e78e
KW
4878 locinput += 2;
4879 }
4880 else {
45fdf108
KW
4881 STRLEN len;
4882
37e2e78e
KW
4883 /* In case have to backtrack to beginning, then match '.' */
4884 char *starting = locinput;
4885
4886 /* In case have to backtrack the last prepend */
e699a1d5 4887 char *previous_prepend = NULL;
37e2e78e
KW
4888
4889 LOAD_UTF8_CHARCLASS_GCB();
4890
45fdf108 4891 /* Match (prepend)* */
220db18a 4892 while (locinput < reginfo->strend
45fdf108
KW
4893 && (len = is_GCB_Prepend_utf8(locinput)))
4894 {
4895 previous_prepend = locinput;
4896 locinput += len;
a1853d78 4897 }
37e2e78e
KW
4898
4899 /* As noted above, if we matched a prepend character, but
4900 * the next thing won't match, back off the last prepend we
4901 * matched, as it is guaranteed to match the begin */
4902 if (previous_prepend
220db18a 4903 && (locinput >= reginfo->strend
c101f46d
KW
4904 || (! swash_fetch(PL_utf8_X_regular_begin,
4905 (U8*)locinput, utf8_target)
bff53399 4906 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
c101f46d 4907 )
37e2e78e
KW
4908 {
4909 locinput = previous_prepend;
4910 }
4911
220db18a 4912 /* Note that here we know reginfo->strend > locinput, as we
37e2e78e
KW
4913 * tested that upon input to this switch case, and if we
4914 * moved locinput forward, we tested the result just above
4915 * and it either passed, or we backed off so that it will
4916 * now pass */
11dfcd49
KW
4917 if (swash_fetch(PL_utf8_X_regular_begin,
4918 (U8*)locinput, utf8_target)) {
27d4fc33
KW
4919 locinput += UTF8SKIP(locinput);
4920 }
bff53399 4921 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
37e2e78e
KW
4922
4923 /* Here did not match the required 'Begin' in the
4924 * second term. So just match the very first
4925 * character, the '.' of the final term of the regex */
4926 locinput = starting + UTF8SKIP(starting);
27d4fc33 4927 goto exit_utf8;
37e2e78e
KW
4928 } else {
4929
11dfcd49
KW
4930 /* Here is a special begin. It can be composed of
4931 * several individual characters. One possibility is
4932 * RI+ */
45fdf108
KW
4933 if ((len = is_GCB_RI_utf8(locinput))) {
4934 locinput += len;
220db18a 4935 while (locinput < reginfo->strend
45fdf108 4936 && (len = is_GCB_RI_utf8(locinput)))
11dfcd49 4937 {
45fdf108 4938 locinput += len;
11dfcd49 4939 }
45fdf108
KW
4940 } else if ((len = is_GCB_T_utf8(locinput))) {
4941 /* Another possibility is T+ */
4942 locinput += len;
220db18a 4943 while (locinput < reginfo->strend
45fdf108 4944 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4945 {
45fdf108 4946 locinput += len;
11dfcd49
KW
4947 }
4948 } else {
4949
4950 /* Here, neither RI+ nor T+; must be some other
4951 * Hangul. That means it is one of the others: L,
4952 * LV, LVT or V, and matches:
4953 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4954
4955 /* Match L* */
220db18a 4956 while (locinput < reginfo->strend
45fdf108 4957 && (len = is_GCB_L_utf8(locinput)))
11dfcd49 4958 {
45fdf108 4959 locinput += len;
11dfcd49 4960 }
37e2e78e 4961
11dfcd49
KW
4962 /* Here, have exhausted L*. If the next character
4963 * is not an LV, LVT nor V, it means we had to have
4964 * at least one L, so matches L+ in the original
4965 * equation, we have a complete hangul syllable.
4966 * Are done. */
4967
220db18a 4968 if (locinput < reginfo->strend
45fdf108 4969 && is_GCB_LV_LVT_V_utf8(locinput))
11dfcd49 4970 {
11dfcd49 4971 /* Otherwise keep going. Must be LV, LVT or V.
7d43c479
KW
4972 * See if LVT, by first ruling out V, then LV */
4973 if (! is_GCB_V_utf8(locinput)
4974 /* All but every TCount one is LV */
4975 && (valid_utf8_to_uvchr((U8 *) locinput,
4976 NULL)
4977 - SBASE)
4978 % TCount != 0)
4979 {
11dfcd49
KW
4980 locinput += UTF8SKIP(locinput);
4981 } else {
4982
4983 /* Must be V or LV. Take it, then match
4984 * V* */
4985 locinput += UTF8SKIP(locinput);
220db18a 4986 while (locinput < reginfo->strend
45fdf108 4987 && (len = is_GCB_V_utf8(locinput)))
11dfcd49 4988 {
45fdf108 4989 locinput += len;
11dfcd49
KW
4990 }
4991 }
37e2e78e 4992
11dfcd49 4993 /* And any of LV, LVT, or V can be followed
45fdf108 4994 * by T* */
220db18a 4995 while (locinput < reginfo->strend
45fdf108 4996 && (len = is_GCB_T_utf8(locinput)))
11dfcd49 4997 {
45fdf108 4998 locinput += len;
11dfcd49
KW
4999 }
5000 }
cd94d768 5001 }
11dfcd49 5002 }
37e2e78e 5003
11dfcd49 5004 /* Match any extender */
220db18a 5005 while (locinput < reginfo->strend
11dfcd49
KW
5006 && swash_fetch(PL_utf8_X_extend,
5007 (U8*)locinput, utf8_target))
5008 {
5009 locinput += UTF8SKIP(locinput);
5010 }
37e2e78e 5011 }
27d4fc33 5012 exit_utf8:
220db18a 5013 if (locinput > reginfo->strend) sayNO;
37e2e78e 5014 }
a0ed51b3 5015 break;
81714fb9 5016
3c0563b9 5017 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
5018 { /* The capture buffer cases. The ones beginning with N for the
5019 named buffers just convert to the equivalent numbered and
5020 pretend they were called as the corresponding numbered buffer
5021 op. */
26ecd678
TC
5022 /* don't initialize these in the declaration, it makes C++
5023 unhappy */
9d9163fb 5024 const char *s;
ff1157ca 5025 char type;
8368298a
TC
5026 re_fold_t folder;
5027 const U8 *fold_array;
26ecd678 5028 UV utf8_fold_flags;
8368298a 5029
d7ef4b73
KW
5030 folder = foldEQ_locale;
5031 fold_array = PL_fold_locale;
5032 type = REFFL;
cea315b6 5033 utf8_fold_flags = FOLDEQ_LOCALE;
d7ef4b73
KW
5034 goto do_nref;
5035
3c0563b9 5036 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
5037 folder = foldEQ_latin1;
5038 fold_array = PL_fold_latin1;
5039 type = REFFA;
5040 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5041 goto do_nref;
5042
3c0563b9 5043 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
5044 folder = foldEQ_latin1;
5045 fold_array = PL_fold_latin1;
5046 type = REFFU;
d513472c 5047 utf8_fold_flags = 0;
d7ef4b73
KW
5048 goto do_nref;
5049
3c0563b9 5050 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
5051 folder = foldEQ;
5052 fold_array = PL_fold;
5053 type = REFF;
d513472c 5054 utf8_fold_flags = 0;
d7ef4b73
KW
5055 goto do_nref;
5056
3c0563b9 5057 case NREF: /* /\g{name}/ */
d7ef4b73 5058 type = REF;
83d7b90b
KW
5059 folder = NULL;
5060 fold_array = NULL;
d513472c 5061 utf8_fold_flags = 0;
d7ef4b73
KW
5062 do_nref:
5063
5064 /* For the named back references, find the corresponding buffer
5065 * number */
0a4db386
YO
5066 n = reg_check_named_buff_matched(rex,scan);
5067
d7ef4b73 5068 if ( ! n ) {
81714fb9 5069 sayNO;
d7ef4b73
KW
5070 }
5071 goto do_nref_ref_common;
5072
3c0563b9 5073 case REFFL: /* /\1/il */
d7ef4b73
KW
5074 folder = foldEQ_locale;
5075 fold_array = PL_fold_locale;
cea315b6 5076 utf8_fold_flags = FOLDEQ_LOCALE;
d7ef4b73
KW
5077 goto do_ref;
5078
3c0563b9 5079 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
5080 folder = foldEQ_latin1;
5081 fold_array = PL_fold_latin1;
5082 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5083 goto do_ref;
5084
3c0563b9 5085 case REFFU: /* /\1/iu */
d7ef4b73
KW
5086 folder = foldEQ_latin1;
5087 fold_array = PL_fold_latin1;
d513472c 5088 utf8_fold_flags = 0;
d7ef4b73
KW
5089 goto do_ref;
5090
3c0563b9 5091 case REFF: /* /\1/i */
d7ef4b73
KW
5092 folder = foldEQ;
5093 fold_array = PL_fold;
d513472c 5094 utf8_fold_flags = 0;
83d7b90b 5095 goto do_ref;
d7ef4b73 5096
3c0563b9 5097 case REF: /* /\1/ */
83d7b90b
KW
5098 folder = NULL;
5099 fold_array = NULL;
d513472c 5100 utf8_fold_flags = 0;
83d7b90b 5101
d7ef4b73 5102 do_ref:
81714fb9 5103 type = OP(scan);
d7ef4b73
KW
5104 n = ARG(scan); /* which paren pair */
5105
5106 do_nref_ref_common:
b93070ed 5107 ln = rex->offs[n].start;
1cb48e53 5108 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
b93070ed 5109 if (rex->lastparen < n || ln == -1)
af3f8c16 5110 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 5111 if (ln == rex->offs[n].end)
a0d0e21e 5112 break;
a0ed51b3 5113
9d9163fb 5114 s = reginfo->strbeg + ln;
d7ef4b73 5115 if (type != REF /* REF can do byte comparison */
31f05a37
KW
5116 && (utf8_target || type == REFFU || type == REFFL))
5117 {
220db18a 5118 char * limit = reginfo->strend;
d7ef4b73
KW
5119
5120 /* This call case insensitively compares the entire buffer
5121 * at s, with the current input starting at locinput, but
220db18a
DM
5122 * not going off the end given by reginfo->strend, and
5123 * returns in <limit> upon success, how much of the
5124 * current input was matched */
b93070ed 5125 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 5126 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
5127 {
5128 sayNO;
a0ed51b3 5129 }
d7ef4b73 5130 locinput = limit;
a0ed51b3
LW
5131 break;
5132 }
5133
d7ef4b73 5134 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
5135 if (!NEXTCHR_IS_EOS &&
5136 UCHARAT(s) != nextchr &&
81714fb9 5137 (type == REF ||
d7ef4b73 5138 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 5139 sayNO;
b93070ed 5140 ln = rex->offs[n].end - ln;
220db18a 5141 if (locinput + ln > reginfo->strend)
4633a7c4 5142 sayNO;
81714fb9 5143 if (ln > 1 && (type == REF
24d3c4a9 5144 ? memNE(s, locinput, ln)
d7ef4b73 5145 : ! folder(s, locinput, ln)))
4633a7c4 5146 sayNO;
24d3c4a9 5147 locinput += ln;
a0d0e21e 5148 break;
81714fb9 5149 }
3c0563b9
DM
5150
5151 case NOTHING: /* null op; e.g. the 'nothing' following
5152 * the '*' in m{(a+|b)*}' */
5153 break;
5154 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 5155 break;
3c0563b9
DM
5156
5157 case BACK: /* ??? doesn't appear to be used ??? */
a0d0e21e 5158 break;
40a82448
DM
5159
5160#undef ST
5161#define ST st->u.eval
c277df42 5162 {
c277df42 5163 SV *ret;
d2f13c59 5164 REGEXP *re_sv;
6bda09f9 5165 regexp *re;
f8fc2ecf 5166 regexp_internal *rei;
1a147d38
YO
5167 regnode *startpoint;
5168
3c0563b9 5169 case GOSTART: /* (?R) */
e7707071
YO
5170 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
5171 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 5172 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 5173 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 5174 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
5175 Perl_croak(aTHX_
5176 "Pattern subroutine nesting without pos change"
5177 " exceeded limit in regex");
6bda09f9
YO
5178 } else {
5179 nochange_depth = 0;
1a147d38 5180 }
288b8c02 5181 re_sv = rex_sv;
6bda09f9 5182 re = rex;
f8fc2ecf 5183 rei = rexi;
1a147d38 5184 if (OP(scan)==GOSUB) {
6bda09f9
YO
5185 startpoint = scan + ARG2L(scan);
5186 ST.close_paren = ARG(scan);
5187 } else {
f8fc2ecf 5188 startpoint = rei->program+1;
6bda09f9
YO
5189 ST.close_paren = 0;
5190 }
d1b2014a
YO
5191
5192 /* Save all the positions seen so far. */
5193 ST.cp = regcppush(rex, 0, maxopenparen);
5194 REGCP_SET(ST.lastcp);
5195
5196 /* and then jump to the code we share with EVAL */
6bda09f9 5197 goto eval_recurse_doit;
d1b2014a 5198
118e2215 5199 assert(0); /* NOTREACHED */
3c0563b9 5200
6bda09f9
YO
5201 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
5202 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 5203 if ( ++nochange_depth > max_nochange_depth )
1a147d38 5204 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
5205 } else {
5206 nochange_depth = 0;
5207 }
8e5e9ebe 5208 {
4aabdb9b 5209 /* execute the code in the {...} */
81ed78b2 5210
4aabdb9b 5211 dSP;
a6dc34f1 5212 IV before;
1f4d1a1e 5213 OP * const oop = PL_op;
4aabdb9b 5214 COP * const ocurcop = PL_curcop;
81ed78b2 5215 OP *nop;
81ed78b2 5216 CV *newcv;
91332126 5217
74088413 5218 /* save *all* paren positions */
92da3157 5219 regcppush(rex, 0, maxopenparen);
74088413
DM
5220 REGCP_SET(runops_cp);
5221
81ed78b2
DM
5222 if (!caller_cv)
5223 caller_cv = find_runcv(NULL);
5224
4aabdb9b 5225 n = ARG(scan);
81ed78b2 5226
b30fcab9 5227 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8d919b0a 5228 newcv = (ReANY(
b30fcab9
DM
5229 (REGEXP*)(rexi->data->data[n])
5230 ))->qr_anoncv
81ed78b2
DM
5231 ;
5232 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
5233 }
5234 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
5235 newcv = caller_cv;
5236 nop = (OP*)rexi->data->data[n];
5237 assert(CvDEPTH(newcv));
68e2671b
DM
5238 }
5239 else {
d24ca0c5
DM
5240 /* literal with own CV */
5241 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
5242 newcv = rex->qr_anoncv;
5243 nop = (OP*)rexi->data->data[n];
68e2671b 5244 }
81ed78b2 5245
0e458318
DM
5246 /* normally if we're about to execute code from the same
5247 * CV that we used previously, we just use the existing
5248 * CX stack entry. However, its possible that in the
5249 * meantime we may have backtracked, popped from the save
5250 * stack, and undone the SAVECOMPPAD(s) associated with
5251 * PUSH_MULTICALL; in which case PL_comppad no longer
5252 * points to newcv's pad. */
5253 if (newcv != last_pushed_cv || PL_comppad != last_pad)
5254 {
b0065247
DM
5255 U8 flags = (CXp_SUB_RE |
5256 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
0e458318 5257 if (last_pushed_cv) {
b0065247 5258 CHANGE_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
5259 }
5260 else {
b0065247 5261 PUSH_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
5262 }
5263 last_pushed_cv = newcv;
5264 }
c31ee3bb
DM
5265 else {
5266 /* these assignments are just to silence compiler
5267 * warnings */
5268 multicall_cop = NULL;
5269 newsp = NULL;
5270 }
0e458318
DM
5271 last_pad = PL_comppad;
5272
2e2e3f36
DM
5273 /* the initial nextstate you would normally execute
5274 * at the start of an eval (which would cause error
5275 * messages to come from the eval), may be optimised
5276 * away from the execution path in the regex code blocks;
5277 * so manually set PL_curcop to it initially */
5278 {
81ed78b2 5279 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
5280 assert(o->op_type == OP_NULL);
5281 if (o->op_targ == OP_SCOPE) {
5282 o = cUNOPo->op_first;
5283 }
5284 else {
5285 assert(o->op_targ == OP_LEAVE);
5286 o = cUNOPo->op_first;
5287 assert(o->op_type == OP_ENTER);
5288 o = o->op_sibling;
5289 }
5290
5291 if (o->op_type != OP_STUB) {
5292 assert( o->op_type == OP_NEXTSTATE
5293 || o->op_type == OP_DBSTATE
5294 || (o->op_type == OP_NULL
5295 && ( o->op_targ == OP_NEXTSTATE
5296 || o->op_targ == OP_DBSTATE
5297 )
5298 )
5299 );
5300 PL_curcop = (COP*)o;
5301 }
5302 }
81ed78b2 5303 nop = nop->op_next;
2e2e3f36 5304
24b23f37 5305 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
5306 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5307
8adc0f72 5308 rex->offs[0].end = locinput - reginfo->strbeg;
bf2039a9 5309 if (reginfo->info_aux_eval->pos_magic)
25fdce4a
FC
5310 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5311 reginfo->sv, reginfo->strbeg,
5312 locinput - reginfo->strbeg);
4aabdb9b 5313
2bf803e2
YO
5314 if (sv_yes_mark) {
5315 SV *sv_mrk = get_sv("REGMARK", 1);
5316 sv_setsv(sv_mrk, sv_yes_mark);
5317 }
5318
81ed78b2
DM
5319 /* we don't use MULTICALL here as we want to call the
5320 * first op of the block of interest, rather than the
5321 * first op of the sub */
a6dc34f1 5322 before = (IV)(SP-PL_stack_base);
81ed78b2 5323 PL_op = nop;
8e5e9ebe
RGS
5324 CALLRUNOPS(aTHX); /* Scalar context. */
5325 SPAGAIN;
a6dc34f1 5326 if ((IV)(SP-PL_stack_base) == before)
075aa684 5327 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
5328 else {
5329 ret = POPs;
5330 PUTBACK;
5331 }
4aabdb9b 5332
e4bfbed3
DM
5333 /* before restoring everything, evaluate the returned
5334 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
5335 * PL_op or pad. Also need to process any magic vars
5336 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
5337
5338 PL_op = NULL;
5339
5e98dac2 5340 re_sv = NULL;
e4bfbed3
DM
5341 if (logical == 0) /* (?{})/ */
5342 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5343 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
5344 sw = cBOOL(SvTRUE(ret));
5345 logical = 0;
5346 }
5347 else { /* /(??{}) */
497d0a96
DM
5348 /* if its overloaded, let the regex compiler handle
5349 * it; otherwise extract regex, or stringify */
237da807 5350 if (SvGMAGICAL(ret))
2685dc2d 5351 ret = sv_mortalcopy(ret);
497d0a96
DM
5352 if (!SvAMAGIC(ret)) {
5353 SV *sv = ret;
5354 if (SvROK(sv))
5355 sv = SvRV(sv);
5356 if (SvTYPE(sv) == SVt_REGEXP)
5357 re_sv = (REGEXP*) sv;
63620942
FC
5358 else if (SvSMAGICAL(ret)) {
5359 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
497d0a96
DM
5360 if (mg)
5361 re_sv = (REGEXP *) mg->mg_obj;
5362 }
e4bfbed3 5363
2685dc2d 5364 /* force any undef warnings here */
237da807
FC
5365 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
5366 ret = sv_mortalcopy(ret);
497d0a96
DM
5367 (void) SvPV_force_nolen(ret);
5368 }
e4bfbed3
DM
5369 }
5370
5371 }
5372
81ed78b2
DM
5373 /* *** Note that at this point we don't restore
5374 * PL_comppad, (or pop the CxSUB) on the assumption it may
5375 * be used again soon. This is safe as long as nothing
5376 * in the regexp code uses the pad ! */
4aabdb9b 5377 PL_op = oop;
4aabdb9b 5378 PL_curcop = ocurcop;
92da3157 5379 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
f5df269c 5380 PL_curpm = PL_reg_curpm;
e4bfbed3
DM
5381
5382 if (logical != 2)
4aabdb9b 5383 break;
8e5e9ebe 5384 }
e4bfbed3
DM
5385
5386 /* only /(??{})/ from now on */
24d3c4a9 5387 logical = 0;
4aabdb9b 5388 {
4f639d21
DM
5389 /* extract RE object from returned value; compiling if
5390 * necessary */
5c35adbb 5391
575c37f6
DM
5392 if (re_sv) {
5393 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 5394 }
0f5d15d6 5395 else {
c737faaf 5396 U32 pm_flags = 0;
0f5d15d6 5397
9753d940
DM
5398 if (SvUTF8(ret) && IN_BYTES) {
5399 /* In use 'bytes': make a copy of the octet
5400 * sequence, but without the flag on */
b9ad30b4
NC
5401 STRLEN len;
5402 const char *const p = SvPV(ret, len);
5403 ret = newSVpvn_flags(p, len, SVs_TEMP);
5404 }
732caac7
DM
5405 if (rex->intflags & PREGf_USE_RE_EVAL)
5406 pm_flags |= PMf_USE_RE_EVAL;
5407
5408 /* if we got here, it should be an engine which
5409 * supports compiling code blocks and stuff */
5410 assert(rex->engine && rex->engine->op_comp);
ec841a27 5411 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 5412 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27
DM
5413 rex->engine, NULL, NULL,
5414 /* copy /msix etc to inner pattern */
5415 scan->flags,
5416 pm_flags);
732caac7 5417
9041c2e3 5418 if (!(SvFLAGS(ret)
237da807
FC
5419 & (SVs_TEMP | SVs_GMG | SVf_ROK))
5420 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
a2794585
NC
5421 /* This isn't a first class regexp. Instead, it's
5422 caching a regexp onto an existing, Perl visible
5423 scalar. */
575c37f6 5424 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 5425 }
0f5d15d6 5426 }
e1ff3a88 5427 SAVEFREESV(re_sv);
8d919b0a 5428 re = ReANY(re_sv);
4aabdb9b 5429 }
07bc277f 5430 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
5431 re->subbeg = rex->subbeg;
5432 re->sublen = rex->sublen;
6502e081
DM
5433 re->suboffset = rex->suboffset;
5434 re->subcoffset = rex->subcoffset;
d1b2014a
YO
5435 re->lastparen = 0;
5436 re->lastcloseparen = 0;
f8fc2ecf 5437 rei = RXi_GET(re);
6bda09f9 5438 DEBUG_EXECUTE_r(
220db18a
DM
5439 debug_start_match(re_sv, utf8_target, locinput,
5440 reginfo->strend, "Matching embedded");
6bda09f9 5441 );
f8fc2ecf 5442 startpoint = rei->program + 1;
1a147d38 5443 ST.close_paren = 0; /* only used for GOSUB */
d1b2014a
YO
5444 /* Save all the seen positions so far. */
5445 ST.cp = regcppush(rex, 0, maxopenparen);
5446 REGCP_SET(ST.lastcp);
5447 /* and set maxopenparen to 0, since we are starting a "fresh" match */
5448 maxopenparen = 0;
5449 /* run the pattern returned from (??{...}) */
aa283a38 5450
d1b2014a
YO
5451 eval_recurse_doit: /* Share code with GOSUB below this line
5452 * At this point we expect the stack context to be
5453 * set up correctly */
4aabdb9b 5454
1cb95af7
DM
5455 /* invalidate the S-L poscache. We're now executing a
5456 * different set of WHILEM ops (and their associated
5457 * indexes) against the same string, so the bits in the
5458 * cache are meaningless. Setting maxiter to zero forces
5459 * the cache to be invalidated and zeroed before reuse.
5460 * XXX This is too dramatic a measure. Ideally we should
5461 * save the old cache and restore when running the outer
5462 * pattern again */
1cb48e53 5463 reginfo->poscache_maxiter = 0;
4aabdb9b 5464
d1b2014a 5465 /* the new regexp might have a different is_utf8_pat than we do */
aed7b151 5466 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
faec1544 5467
288b8c02 5468 ST.prev_rex = rex_sv;
faec1544 5469 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
5470 rex_sv = re_sv;
5471 SET_reg_curpm(rex_sv);
288b8c02 5472 rex = re;
f8fc2ecf 5473 rexi = rei;
faec1544 5474 cur_curlyx = NULL;
40a82448 5475 ST.B = next;
faec1544
DM
5476 ST.prev_eval = cur_eval;
5477 cur_eval = st;
faec1544 5478 /* now continue from first node in postoned RE */
4d5016e5 5479 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
118e2215 5480 assert(0); /* NOTREACHED */
c277df42 5481 }
40a82448 5482
faec1544
DM
5483 case EVAL_AB: /* cleanup after a successful (??{A})B */
5484 /* note: this is called twice; first after popping B, then A */
ec43f78b 5485 rex_sv = ST.prev_rex;
aed7b151 5486 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 5487 SET_reg_curpm(rex_sv);
8d919b0a 5488 rex = ReANY(rex_sv);
f8fc2ecf 5489 rexi = RXi_GET(rex);
4b22688e
YO
5490 {
5491 /* preserve $^R across LEAVE's. See Bug 121070. */
5492 SV *save_sv= GvSV(PL_replgv);
5493 SvREFCNT_inc(save_sv);
5494 regcpblow(ST.cp); /* LEAVE in disguise */
5495 sv_setsv(GvSV(PL_replgv), save_sv);
5496 SvREFCNT_dec(save_sv);
5497 }
faec1544
DM
5498 cur_eval = ST.prev_eval;
5499 cur_curlyx = ST.prev_curlyx;
34a81e2b 5500
1cb95af7 5501 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 5502 reginfo->poscache_maxiter = 0;
e7707071 5503 if ( nochange_depth )
4b196cd4 5504 nochange_depth--;
262b90c4 5505 sayYES;
40a82448 5506
40a82448 5507
faec1544
DM
5508 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5509 /* note: this is called twice; first after popping B, then A */
ec43f78b 5510 rex_sv = ST.prev_rex;
aed7b151 5511 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 5512 SET_reg_curpm(rex_sv);
8d919b0a 5513 rex = ReANY(rex_sv);
f8fc2ecf 5514 rexi = RXi_GET(rex);
0357f1fd 5515
40a82448 5516 REGCP_UNWIND(ST.lastcp);
92da3157 5517 regcppop(rex, &maxopenparen);
faec1544
DM
5518 cur_eval = ST.prev_eval;
5519 cur_curlyx = ST.prev_curlyx;
1cb95af7 5520 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 5521 reginfo->poscache_maxiter = 0;
e7707071 5522 if ( nochange_depth )
4b196cd4 5523 nochange_depth--;
40a82448 5524 sayNO_SILENT;
40a82448
DM
5525#undef ST
5526
3c0563b9 5527 case OPEN: /* ( */
c277df42 5528 n = ARG(scan); /* which paren pair */
9d9163fb 5529 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
92da3157
DM
5530 if (n > maxopenparen)
5531 maxopenparen = n;
495f47a5 5532 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
92da3157 5533 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
495f47a5
DM
5534 PTR2UV(rex),
5535 PTR2UV(rex->offs),
5536 (UV)n,
5537 (IV)rex->offs[n].start_tmp,
92da3157 5538 (UV)maxopenparen
495f47a5 5539 ));
e2e6a0f1 5540 lastopen = n;
a0d0e21e 5541 break;
495f47a5
DM
5542
5543/* XXX really need to log other places start/end are set too */
5544#define CLOSE_CAPTURE \
5545 rex->offs[n].start = rex->offs[n].start_tmp; \
9d9163fb 5546 rex->offs[n].end = locinput - reginfo->strbeg; \
495f47a5
DM
5547 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5548 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5549 PTR2UV(rex), \
5550 PTR2UV(rex->offs), \
5551 (UV)n, \
5552 (IV)rex->offs[n].start, \
5553 (IV)rex->offs[n].end \
5554 ))
5555
3c0563b9 5556 case CLOSE: /* ) */
c277df42 5557 n = ARG(scan); /* which paren pair */
495f47a5 5558 CLOSE_CAPTURE;
b93070ed
DM
5559 if (n > rex->lastparen)
5560 rex->lastparen = n;
5561 rex->lastcloseparen = n;
3b6647e0 5562 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
5563 goto fake_end;
5564 }
a0d0e21e 5565 break;
3c0563b9
DM
5566
5567 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
5568 if (ARG(scan)){
5569 regnode *cursor;
5570 for (cursor=scan;
5571 cursor && OP(cursor)!=END;
5572 cursor=regnext(cursor))
5573 {
5574 if ( OP(cursor)==CLOSE ){
5575 n = ARG(cursor);
5576 if ( n <= lastopen ) {
495f47a5 5577 CLOSE_CAPTURE;
b93070ed
DM
5578 if (n > rex->lastparen)
5579 rex->lastparen = n;
5580 rex->lastcloseparen = n;
3b6647e0
RB
5581 if ( n == ARG(scan) || (cur_eval &&
5582 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
5583 break;
5584 }
5585 }
5586 }
5587 }
5588 goto fake_end;
5589 /*NOTREACHED*/
3c0563b9
DM
5590
5591 case GROUPP: /* (?(1)) */
c277df42 5592 n = ARG(scan); /* which paren pair */
b93070ed 5593 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 5594 break;
3c0563b9
DM
5595
5596 case NGROUPP: /* (?(<name>)) */
0a4db386 5597 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 5598 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 5599 break;
3c0563b9
DM
5600
5601 case INSUBP: /* (?(R)) */
0a4db386 5602 n = ARG(scan);
3b6647e0 5603 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 5604 break;
3c0563b9
DM
5605
5606 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
5607 sw = 0;
5608 break;
3c0563b9
DM
5609
5610 case IFTHEN: /* (?(cond)A|B) */
1cb48e53 5611 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
24d3c4a9 5612 if (sw)
c277df42
IZ
5613 next = NEXTOPER(NEXTOPER(scan));
5614 else {
5615 next = scan + ARG(scan);
5616 if (OP(next) == IFTHEN) /* Fake one. */
5617 next = NEXTOPER(NEXTOPER(next));
5618 }
5619 break;
3c0563b9
DM
5620
5621 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 5622 logical = scan->flags;
c277df42 5623 break;
c476f425 5624
2ab05381 5625/*******************************************************************
2ab05381 5626
c476f425
DM
5627The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5628pattern, where A and B are subpatterns. (For simple A, CURLYM or
5629STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 5630
c476f425 5631A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 5632
c476f425
DM
5633On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5634state, which contains the current count, initialised to -1. It also sets
5635cur_curlyx to point to this state, with any previous value saved in the
5636state block.
2ab05381 5637
c476f425
DM
5638CURLYX then jumps straight to the WHILEM op, rather than executing A,
5639since the pattern may possibly match zero times (i.e. it's a while {} loop
5640rather than a do {} while loop).
2ab05381 5641
c476f425
DM
5642Each entry to WHILEM represents a successful match of A. The count in the
5643CURLYX block is incremented, another WHILEM state is pushed, and execution
5644passes to A or B depending on greediness and the current count.
2ab05381 5645
c476f425
DM
5646For example, if matching against the string a1a2a3b (where the aN are
5647substrings that match /A/), then the match progresses as follows: (the
5648pushed states are interspersed with the bits of strings matched so far):
2ab05381 5649
c476f425
DM
5650 <CURLYX cnt=-1>
5651 <CURLYX cnt=0><WHILEM>
5652 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5653 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5654 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5655 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 5656
c476f425
DM
5657(Contrast this with something like CURLYM, which maintains only a single
5658backtrack state:
2ab05381 5659
c476f425
DM
5660 <CURLYM cnt=0> a1
5661 a1 <CURLYM cnt=1> a2
5662 a1 a2 <CURLYM cnt=2> a3
5663 a1 a2 a3 <CURLYM cnt=3> b
5664)
2ab05381 5665
c476f425
DM
5666Each WHILEM state block marks a point to backtrack to upon partial failure
5667of A or B, and also contains some minor state data related to that
5668iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5669overall state, such as the count, and pointers to the A and B ops.
2ab05381 5670
c476f425
DM
5671This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5672must always point to the *current* CURLYX block, the rules are:
2ab05381 5673
c476f425
DM
5674When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5675and set cur_curlyx to point the new block.
2ab05381 5676
c476f425
DM
5677When popping the CURLYX block after a successful or unsuccessful match,
5678restore the previous cur_curlyx.
2ab05381 5679
c476f425
DM
5680When WHILEM is about to execute B, save the current cur_curlyx, and set it
5681to the outer one saved in the CURLYX block.
2ab05381 5682
c476f425
DM
5683When popping the WHILEM block after a successful or unsuccessful B match,
5684restore the previous cur_curlyx.
2ab05381 5685
c476f425
DM
5686Here's an example for the pattern (AI* BI)*BO
5687I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 5688
c476f425
DM
5689cur_
5690curlyx backtrack stack
5691------ ---------------
5692NULL
5693CO <CO prev=NULL> <WO>
5694CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5695CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5696NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 5697
c476f425
DM
5698At this point the pattern succeeds, and we work back down the stack to
5699clean up, restoring as we go:
95b24440 5700
c476f425
DM
5701CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5702CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5703CO <CO prev=NULL> <WO>
5704NULL
a0374537 5705
c476f425
DM
5706*******************************************************************/
5707
5708#define ST st->u.curlyx
5709
5710 case CURLYX: /* start of /A*B/ (for complex A) */
5711 {
5712 /* No need to save/restore up to this paren */
5713 I32 parenfloor = scan->flags;
5714
5715 assert(next); /* keep Coverity happy */
5716 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5717 next += ARG(next);
5718
5719 /* XXXX Probably it is better to teach regpush to support
92da3157 5720 parenfloor > maxopenparen ... */
b93070ed
DM
5721 if (parenfloor > (I32)rex->lastparen)
5722 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
5723
5724 ST.prev_curlyx= cur_curlyx;
5725 cur_curlyx = st;
5726 ST.cp = PL_savestack_ix;
5727
5728 /* these fields contain the state of the current curly.
5729 * they are accessed by subsequent WHILEMs */
5730 ST.parenfloor = parenfloor;
d02d6d97 5731 ST.me = scan;
c476f425 5732 ST.B = next;
24d3c4a9
DM
5733 ST.minmod = minmod;
5734 minmod = 0;
c476f425
DM
5735 ST.count = -1; /* this will be updated by WHILEM */
5736 ST.lastloc = NULL; /* this will be updated by WHILEM */
5737
4d5016e5 5738 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
118e2215 5739 assert(0); /* NOTREACHED */
c476f425 5740 }
a0d0e21e 5741
c476f425 5742 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
5743 cur_curlyx = ST.prev_curlyx;
5744 sayYES;
118e2215 5745 assert(0); /* NOTREACHED */
a0d0e21e 5746
c476f425
DM
5747 case CURLYX_end_fail: /* just failed to match all of A*B */
5748 regcpblow(ST.cp);
5749 cur_curlyx = ST.prev_curlyx;
5750 sayNO;
118e2215 5751 assert(0); /* NOTREACHED */
4633a7c4 5752
a0d0e21e 5753
c476f425
DM
5754#undef ST
5755#define ST st->u.whilem
5756
5757 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5758 {
5759 /* see the discussion above about CURLYX/WHILEM */
c476f425 5760 I32 n;
d02d6d97
DM
5761 int min = ARG1(cur_curlyx->u.curlyx.me);
5762 int max = ARG2(cur_curlyx->u.curlyx.me);
5763 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5764
c476f425
DM
5765 assert(cur_curlyx); /* keep Coverity happy */
5766 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5767 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5768 ST.cache_offset = 0;
5769 ST.cache_mask = 0;
5770
c476f425
DM
5771
5772 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
5773 "%*s whilem: matched %ld out of %d..%d\n",
5774 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 5775 );
a0d0e21e 5776
c476f425 5777 /* First just match a string of min A's. */
a0d0e21e 5778
d02d6d97 5779 if (n < min) {
92da3157
DM
5780 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5781 maxopenparen);
c476f425 5782 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
5783 REGCP_SET(ST.lastcp);
5784
4d5016e5 5785 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
118e2215 5786 assert(0); /* NOTREACHED */
c476f425
DM
5787 }
5788
5789 /* If degenerate A matches "", assume A done. */
5790
5791 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5792 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5793 "%*s whilem: empty match detected, trying continuation...\n",
5794 REPORT_CODE_OFF+depth*2, "")
5795 );
5796 goto do_whilem_B_max;
5797 }
5798
1cb95af7
DM
5799 /* super-linear cache processing.
5800 *
5801 * The idea here is that for certain types of CURLYX/WHILEM -
5802 * principally those whose upper bound is infinity (and
5803 * excluding regexes that have things like \1 and other very
5804 * non-regular expresssiony things), then if a pattern like
5805 * /....A*.../ fails and we backtrack to the WHILEM, then we
5806 * make a note that this particular WHILEM op was at string
5807 * position 47 (say) when the rest of pattern failed. Then, if
5808 * we ever find ourselves back at that WHILEM, and at string
5809 * position 47 again, we can just fail immediately rather than
5810 * running the rest of the pattern again.
5811 *
5812 * This is very handy when patterns start to go
5813 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5814 * with a combinatorial explosion of backtracking.
5815 *
5816 * The cache is implemented as a bit array, with one bit per
5817 * string byte position per WHILEM op (up to 16) - so its
5818 * between 0.25 and 2x the string size.
5819 *
5820 * To avoid allocating a poscache buffer every time, we do an
5821 * initially countdown; only after we have executed a WHILEM
5822 * op (string-length x #WHILEMs) times do we allocate the
5823 * cache.
5824 *
5825 * The top 4 bits of scan->flags byte say how many different
5826 * relevant CURLLYX/WHILEM op pairs there are, while the
5827 * bottom 4-bits is the identifying index number of this
5828 * WHILEM.
5829 */
c476f425
DM
5830
5831 if (scan->flags) {
a0d0e21e 5832
1cb48e53 5833 if (!reginfo->poscache_maxiter) {
c476f425
DM
5834 /* start the countdown: Postpone detection until we
5835 * know the match is not *that* much linear. */
1cb48e53 5836 reginfo->poscache_maxiter
9d9163fb
DM
5837 = (reginfo->strend - reginfo->strbeg + 1)
5838 * (scan->flags>>4);
66bf836d 5839 /* possible overflow for long strings and many CURLYX's */
1cb48e53
DM
5840 if (reginfo->poscache_maxiter < 0)
5841 reginfo->poscache_maxiter = I32_MAX;
5842 reginfo->poscache_iter = reginfo->poscache_maxiter;
2c2d71f5 5843 }
c476f425 5844
1cb48e53 5845 if (reginfo->poscache_iter-- == 0) {
c476f425 5846 /* initialise cache */
ea3daa5d 5847 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
2ac8ff4b
DM
5848 regmatch_info_aux *const aux = reginfo->info_aux;
5849 if (aux->poscache) {
ea3daa5d 5850 if ((SSize_t)reginfo->poscache_size < size) {
2ac8ff4b
DM
5851 Renew(aux->poscache, size, char);
5852 reginfo->poscache_size = size;
2c2d71f5 5853 }
2ac8ff4b 5854 Zero(aux->poscache, size, char);
2c2d71f5
JH
5855 }
5856 else {
2ac8ff4b
DM
5857 reginfo->poscache_size = size;
5858 Newxz(aux->poscache, size, char);
2c2d71f5 5859 }
c476f425
DM
5860 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5861 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5862 PL_colors[4], PL_colors[5])
5863 );
2c2d71f5 5864 }
c476f425 5865
1cb48e53 5866 if (reginfo->poscache_iter < 0) {
c476f425 5867 /* have we already failed at this position? */
ea3daa5d 5868 SSize_t offset, mask;
338e600a
DM
5869
5870 reginfo->poscache_iter = -1; /* stop eventual underflow */
c476f425 5871 offset = (scan->flags & 0xf) - 1
9d9163fb
DM
5872 + (locinput - reginfo->strbeg)
5873 * (scan->flags>>4);
c476f425
DM
5874 mask = 1 << (offset % 8);
5875 offset /= 8;
2ac8ff4b 5876 if (reginfo->info_aux->poscache[offset] & mask) {
c476f425
DM
5877 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5878 "%*s whilem: (cache) already tried at this position...\n",
5879 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 5880 );
3298f257 5881 sayNO; /* cache records failure */
2c2d71f5 5882 }
c476f425
DM
5883 ST.cache_offset = offset;
5884 ST.cache_mask = mask;
2c2d71f5 5885 }
c476f425 5886 }
2c2d71f5 5887
c476f425 5888 /* Prefer B over A for minimal matching. */
a687059c 5889
c476f425
DM
5890 if (cur_curlyx->u.curlyx.minmod) {
5891 ST.save_curlyx = cur_curlyx;
5892 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
92da3157
DM
5893 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5894 maxopenparen);
c476f425 5895 REGCP_SET(ST.lastcp);
4d5016e5
DM
5896 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5897 locinput);
118e2215 5898 assert(0); /* NOTREACHED */
c476f425 5899 }
a0d0e21e 5900
c476f425
DM
5901 /* Prefer A over B for maximal matching. */
5902
d02d6d97 5903 if (n < max) { /* More greed allowed? */
92da3157
DM
5904 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5905 maxopenparen);
c476f425
DM
5906 cur_curlyx->u.curlyx.lastloc = locinput;
5907 REGCP_SET(ST.lastcp);
4d5016e5 5908 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
118e2215 5909 assert(0); /* NOTREACHED */
c476f425
DM
5910 }
5911 goto do_whilem_B_max;
5912 }
118e2215 5913 assert(0); /* NOTREACHED */
c476f425
DM
5914
5915 case WHILEM_B_min: /* just matched B in a minimal match */
5916 case WHILEM_B_max: /* just matched B in a maximal match */
5917 cur_curlyx = ST.save_curlyx;
5918 sayYES;
118e2215 5919 assert(0); /* NOTREACHED */
c476f425
DM
5920
5921 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5922 cur_curlyx = ST.save_curlyx;
5923 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5924 cur_curlyx->u.curlyx.count--;
5925 CACHEsayNO;
118e2215 5926 assert(0); /* NOTREACHED */
c476f425
DM
5927
5928 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
924ba076 5929 /* FALLTHROUGH */
c476f425 5930 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa 5931 REGCP_UNWIND(ST.lastcp);
92da3157 5932 regcppop(rex, &maxopenparen);
c476f425
DM
5933 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5934 cur_curlyx->u.curlyx.count--;
5935 CACHEsayNO;
118e2215 5936 assert(0); /* NOTREACHED */
c476f425
DM
5937
5938 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5939 REGCP_UNWIND(ST.lastcp);
92da3157 5940 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
c476f425
DM
5941 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5942 "%*s whilem: failed, trying continuation...\n",
5943 REPORT_CODE_OFF+depth*2, "")
5944 );
5945 do_whilem_B_max:
5946 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5947 && ckWARN(WARN_REGEXP)
39819bd9 5948 && !reginfo->warned)
c476f425 5949 {
39819bd9 5950 reginfo->warned = TRUE;
dcbac5bb
FC
5951 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5952 "Complex regular subexpression recursion limit (%d) "
5953 "exceeded",
c476f425
DM
5954 REG_INFTY - 1);
5955 }
5956
5957 /* now try B */
5958 ST.save_curlyx = cur_curlyx;
5959 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
5960 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5961 locinput);
118e2215 5962 assert(0); /* NOTREACHED */
c476f425
DM
5963
5964 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5965 cur_curlyx = ST.save_curlyx;
5966 REGCP_UNWIND(ST.lastcp);
92da3157 5967 regcppop(rex, &maxopenparen);
c476f425 5968
d02d6d97 5969 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
5970 /* Maximum greed exceeded */
5971 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5972 && ckWARN(WARN_REGEXP)
39819bd9 5973 && !reginfo->warned)
c476f425 5974 {
39819bd9 5975 reginfo->warned = TRUE;
c476f425 5976 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
5977 "Complex regular subexpression recursion "
5978 "limit (%d) exceeded",
c476f425 5979 REG_INFTY - 1);
a0d0e21e 5980 }
c476f425 5981 cur_curlyx->u.curlyx.count--;
3ab3c9b4 5982 CACHEsayNO;
a0d0e21e 5983 }
c476f425
DM
5984
5985 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5986 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5987 );
5988 /* Try grabbing another A and see if it helps. */
c476f425 5989 cur_curlyx->u.curlyx.lastloc = locinput;
92da3157
DM
5990 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5991 maxopenparen);
c476f425 5992 REGCP_SET(ST.lastcp);
d02d6d97 5993 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
5994 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5995 locinput);
118e2215 5996 assert(0); /* NOTREACHED */
40a82448
DM
5997
5998#undef ST
5999#define ST st->u.branch
6000
6001 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
6002 next = scan + ARG(scan);
6003 if (next == scan)
6004 next = NULL;
40a82448 6005 scan = NEXTOPER(scan);
924ba076 6006 /* FALLTHROUGH */
c277df42 6007
40a82448
DM
6008 case BRANCH: /* /(...|A|...)/ */
6009 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 6010 ST.lastparen = rex->lastparen;
f6033a9d 6011 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
6012 ST.next_branch = next;
6013 REGCP_SET(ST.cp);
02db2b7b 6014
40a82448 6015 /* Now go into the branch */
5d458dd8 6016 if (has_cutgroup) {
4d5016e5 6017 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 6018 } else {
4d5016e5 6019 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 6020 }
118e2215 6021 assert(0); /* NOTREACHED */
3c0563b9
DM
6022
6023 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 6024 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 6025 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6026 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
118e2215 6027 assert(0); /* NOTREACHED */
3c0563b9 6028
5d458dd8
YO
6029 case CUTGROUP_next_fail:
6030 do_cutgroup = 1;
6031 no_final = 1;
6032 if (st->u.mark.mark_name)
6033 sv_commit = st->u.mark.mark_name;
6034 sayNO;
118e2215 6035 assert(0); /* NOTREACHED */
3c0563b9 6036
5d458dd8
YO
6037 case BRANCH_next:
6038 sayYES;
118e2215 6039 assert(0); /* NOTREACHED */
3c0563b9 6040
40a82448 6041 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
6042 if (do_cutgroup) {
6043 do_cutgroup = 0;
6044 no_final = 0;
6045 }
40a82448 6046 REGCP_UNWIND(ST.cp);
a8d1f4b4 6047 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
6048 scan = ST.next_branch;
6049 /* no more branches? */
5d458dd8
YO
6050 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
6051 DEBUG_EXECUTE_r({
6052 PerlIO_printf( Perl_debug_log,
6053 "%*s %sBRANCH failed...%s\n",
6054 REPORT_CODE_OFF+depth*2, "",
6055 PL_colors[4],
6056 PL_colors[5] );
6057 });
6058 sayNO_SILENT;
6059 }
40a82448 6060 continue; /* execute next BRANCH[J] op */
118e2215 6061 assert(0); /* NOTREACHED */
40a82448 6062
3c0563b9 6063 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 6064 minmod = 1;
a0d0e21e 6065 break;
40a82448
DM
6066
6067#undef ST
6068#define ST st->u.curlym
6069
6070 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
6071
6072 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 6073 * only a single backtracking state, no matter how many matches
40a82448
DM
6074 * there are in {m,n}. It relies on the pattern being constant
6075 * length, with no parens to influence future backrefs
6076 */
6077
6078 ST.me = scan;
dc45a647 6079 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 6080
f6033a9d
DM
6081 ST.lastparen = rex->lastparen;
6082 ST.lastcloseparen = rex->lastcloseparen;
6083
40a82448
DM
6084 /* if paren positive, emulate an OPEN/CLOSE around A */
6085 if (ST.me->flags) {
3b6647e0 6086 U32 paren = ST.me->flags;
92da3157
DM
6087 if (paren > maxopenparen)
6088 maxopenparen = paren;
c277df42 6089 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 6090 }
40a82448
DM
6091 ST.A = scan;
6092 ST.B = next;
6093 ST.alen = 0;
6094 ST.count = 0;
24d3c4a9
DM
6095 ST.minmod = minmod;
6096 minmod = 0;
40a82448
DM
6097 ST.c1 = CHRTEST_UNINIT;
6098 REGCP_SET(ST.cp);
6407bf3b 6099
40a82448
DM
6100 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
6101 goto curlym_do_B;
6102
6103 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 6104 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
118e2215 6105 assert(0); /* NOTREACHED */
5f80c4cf 6106
40a82448 6107 case CURLYM_A: /* we've just matched an A */
40a82448
DM
6108 ST.count++;
6109 /* after first match, determine A's length: u.curlym.alen */
6110 if (ST.count == 1) {
ba44c216 6111 if (reginfo->is_utf8_target) {
c07e9d7b
DM
6112 char *s = st->locinput;
6113 while (s < locinput) {
40a82448
DM
6114 ST.alen++;
6115 s += UTF8SKIP(s);
6116 }
6117 }
6118 else {
c07e9d7b 6119 ST.alen = locinput - st->locinput;
40a82448
DM
6120 }
6121 if (ST.alen == 0)
6122 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
6123 }
0cadcf80
DM
6124 DEBUG_EXECUTE_r(
6125 PerlIO_printf(Perl_debug_log,
40a82448 6126 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 6127 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 6128 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
6129 );
6130
0a4db386 6131 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 6132 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
6133 goto fake_end;
6134
c966426a
DM
6135 {
6136 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
6137 if ( max == REG_INFTY || ST.count < max )
6138 goto curlym_do_A; /* try to match another A */
6139 }
40a82448 6140 goto curlym_do_B; /* try to match B */
5f80c4cf 6141
40a82448
DM
6142 case CURLYM_A_fail: /* just failed to match an A */
6143 REGCP_UNWIND(ST.cp);
0a4db386
YO
6144
6145 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
6146 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 6147 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 6148 sayNO;
0cadcf80 6149
40a82448 6150 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
6151 if (ST.c1 == CHRTEST_UNINIT) {
6152 /* calculate c1 and c2 for possible match of 1st char
6153 * following curly */
6154 ST.c1 = ST.c2 = CHRTEST_VOID;
6155 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
6156 regnode *text_node = ST.B;
6157 if (! HAS_TEXT(text_node))
6158 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
6159 /* this used to be
6160
6161 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
6162
6163 But the former is redundant in light of the latter.
6164
6165 if this changes back then the macro for
6166 IS_TEXT and friends need to change.
6167 */
c74f6de9 6168 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8 6169 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 6170 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 6171 reginfo))
c74f6de9
KW
6172 {
6173 sayNO;
6174 }
c277df42 6175 }
c277df42 6176 }
40a82448
DM
6177 }
6178
6179 DEBUG_EXECUTE_r(
6180 PerlIO_printf(Perl_debug_log,
6181 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 6182 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
6183 "", (IV)ST.count)
6184 );
c74f6de9 6185 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
6186 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
6187 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6188 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6189 {
6190 /* simulate B failing */
6191 DEBUG_OPTIMISE_r(
6192 PerlIO_printf(Perl_debug_log,
33daa3a5 6193 "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
79a2a0e8
KW
6194 (int)(REPORT_CODE_OFF+(depth*2)),"",
6195 valid_utf8_to_uvchr((U8 *) locinput, NULL),
6196 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
6197 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
6198 );
6199 state_num = CURLYM_B_fail;
6200 goto reenter_switch;
6201 }
6202 }
6203 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
6204 /* simulate B failing */
6205 DEBUG_OPTIMISE_r(
6206 PerlIO_printf(Perl_debug_log,
33daa3a5 6207 "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
5400f398 6208 (int)(REPORT_CODE_OFF+(depth*2)),"",
79a2a0e8
KW
6209 (int) nextchr, ST.c1, ST.c2)
6210 );
5400f398
KW
6211 state_num = CURLYM_B_fail;
6212 goto reenter_switch;
6213 }
c74f6de9 6214 }
40a82448
DM
6215
6216 if (ST.me->flags) {
f6033a9d 6217 /* emulate CLOSE: mark current A as captured */
40a82448
DM
6218 I32 paren = ST.me->flags;
6219 if (ST.count) {
b93070ed 6220 rex->offs[paren].start
9d9163fb
DM
6221 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
6222 rex->offs[paren].end = locinput - reginfo->strbeg;
f6033a9d
DM
6223 if ((U32)paren > rex->lastparen)
6224 rex->lastparen = paren;
6225 rex->lastcloseparen = paren;
c277df42 6226 }
40a82448 6227 else
b93070ed 6228 rex->offs[paren].end = -1;
0a4db386 6229 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 6230 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
6231 {
6232 if (ST.count)
6233 goto fake_end;
6234 else
6235 sayNO;
6236 }
c277df42 6237 }
0a4db386 6238
4d5016e5 6239 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
118e2215 6240 assert(0); /* NOTREACHED */
40a82448
DM
6241
6242 case CURLYM_B_fail: /* just failed to match a B */
6243 REGCP_UNWIND(ST.cp);
a8d1f4b4 6244 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 6245 if (ST.minmod) {
84d2fa14
HS
6246 I32 max = ARG2(ST.me);
6247 if (max != REG_INFTY && ST.count == max)
40a82448
DM
6248 sayNO;
6249 goto curlym_do_A; /* try to match a further A */
6250 }
6251 /* backtrack one A */
6252 if (ST.count == ARG1(ST.me) /* min */)
6253 sayNO;
6254 ST.count--;
7016d6eb 6255 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
6256 goto curlym_do_B; /* try to match B */
6257
c255a977
DM
6258#undef ST
6259#define ST st->u.curly
40a82448 6260
c255a977
DM
6261#define CURLY_SETPAREN(paren, success) \
6262 if (paren) { \
6263 if (success) { \
9d9163fb
DM
6264 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6265 rex->offs[paren].end = locinput - reginfo->strbeg; \
f6033a9d
DM
6266 if (paren > rex->lastparen) \
6267 rex->lastparen = paren; \
b93070ed 6268 rex->lastcloseparen = paren; \
c255a977 6269 } \
f6033a9d 6270 else { \
b93070ed 6271 rex->offs[paren].end = -1; \
f6033a9d
DM
6272 rex->lastparen = ST.lastparen; \
6273 rex->lastcloseparen = ST.lastcloseparen; \
6274 } \
c255a977
DM
6275 }
6276
b40a2c17 6277 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
6278 ST.paren = 0;
6279 ST.min = 0;
6280 ST.max = REG_INFTY;
a0d0e21e
LW
6281 scan = NEXTOPER(scan);
6282 goto repeat;
3c0563b9 6283
b40a2c17 6284 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
6285 ST.paren = 0;
6286 ST.min = 1;
6287 ST.max = REG_INFTY;
c277df42 6288 scan = NEXTOPER(scan);
c255a977 6289 goto repeat;
3c0563b9 6290
b40a2c17 6291 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
6292 ST.paren = scan->flags; /* Which paren to set */
6293 ST.lastparen = rex->lastparen;
f6033a9d 6294 ST.lastcloseparen = rex->lastcloseparen;
92da3157
DM
6295 if (ST.paren > maxopenparen)
6296 maxopenparen = ST.paren;
c255a977
DM
6297 ST.min = ARG1(scan); /* min to match */
6298 ST.max = ARG2(scan); /* max to match */
0a4db386 6299 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6300 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6301 ST.min=1;
6302 ST.max=1;
6303 }
c255a977
DM
6304 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6305 goto repeat;
3c0563b9 6306
b40a2c17 6307 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
6308 ST.paren = 0;
6309 ST.min = ARG1(scan); /* min to match */
6310 ST.max = ARG2(scan); /* max to match */
6311 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 6312 repeat:
a0d0e21e
LW
6313 /*
6314 * Lookahead to avoid useless match attempts
6315 * when we know what character comes next.
c255a977 6316 *
5f80c4cf
JP
6317 * Used to only do .*x and .*?x, but now it allows
6318 * for )'s, ('s and (?{ ... })'s to be in the way
6319 * of the quantifier and the EXACT-like node. -- japhy
6320 */
6321
eb5c1be8 6322 assert(ST.min <= ST.max);
3337dfe3
KW
6323 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6324 ST.c1 = ST.c2 = CHRTEST_VOID;
6325 }
6326 else {
5f80c4cf
JP
6327 regnode *text_node = next;
6328
3dab1dad
YO
6329 if (! HAS_TEXT(text_node))
6330 FIND_NEXT_IMPT(text_node);
5f80c4cf 6331
9e137952 6332 if (! HAS_TEXT(text_node))
c255a977 6333 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 6334 else {
ee9b8eae 6335 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 6336 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 6337 }
c74f6de9 6338 else {
ee9b8eae
YO
6339
6340 /* Currently we only get here when
6341
6342 PL_rekind[OP(text_node)] == EXACT
6343
6344 if this changes back then the macro for IS_TEXT and
6345 friends need to change. */
79a2a0e8 6346 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 6347 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 6348 reginfo))
c74f6de9
KW
6349 {
6350 sayNO;
6351 }
6352 }
1aa99e6b 6353 }
bbce6d69 6354 }
c255a977
DM
6355
6356 ST.A = scan;
6357 ST.B = next;
24d3c4a9 6358 if (minmod) {
eb72505d 6359 char *li = locinput;
24d3c4a9 6360 minmod = 0;
984e6dd1 6361 if (ST.min &&
f9176b44 6362 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
984e6dd1 6363 < ST.min)
4633a7c4 6364 sayNO;
7016d6eb 6365 SET_locinput(li);
c255a977 6366 ST.count = ST.min;
c255a977
DM
6367 REGCP_SET(ST.cp);
6368 if (ST.c1 == CHRTEST_VOID)
6369 goto curly_try_B_min;
6370
6371 ST.oldloc = locinput;
6372
6373 /* set ST.maxpos to the furthest point along the
6374 * string that could possibly match */
6375 if (ST.max == REG_INFTY) {
220db18a 6376 ST.maxpos = reginfo->strend - 1;
f2ed9b32 6377 if (utf8_target)
c255a977
DM
6378 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6379 ST.maxpos--;
6380 }
f2ed9b32 6381 else if (utf8_target) {
c255a977
DM
6382 int m = ST.max - ST.min;
6383 for (ST.maxpos = locinput;
220db18a 6384 m >0 && ST.maxpos < reginfo->strend; m--)
c255a977
DM
6385 ST.maxpos += UTF8SKIP(ST.maxpos);
6386 }
6387 else {
6388 ST.maxpos = locinput + ST.max - ST.min;
220db18a
DM
6389 if (ST.maxpos >= reginfo->strend)
6390 ST.maxpos = reginfo->strend - 1;
c255a977
DM
6391 }
6392 goto curly_try_B_min_known;
6393
6394 }
6395 else {
eb72505d
DM
6396 /* avoid taking address of locinput, so it can remain
6397 * a register var */
6398 char *li = locinput;
f9176b44 6399 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
c255a977
DM
6400 if (ST.count < ST.min)
6401 sayNO;
7016d6eb 6402 SET_locinput(li);
c255a977
DM
6403 if ((ST.count > ST.min)
6404 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6405 {
6406 /* A{m,n} must come at the end of the string, there's
6407 * no point in backing off ... */
6408 ST.min = ST.count;
6409 /* ...except that $ and \Z can match before *and* after
6410 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6411 We may back off by one in this case. */
eb72505d 6412 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
6413 ST.min--;
6414 }
6415 REGCP_SET(ST.cp);
6416 goto curly_try_B_max;
6417 }
118e2215 6418 assert(0); /* NOTREACHED */
c255a977
DM
6419
6420
6421 case CURLY_B_min_known_fail:
6422 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 6423
c255a977 6424 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6425 if (ST.paren) {
6426 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6427 }
c255a977
DM
6428 /* Couldn't or didn't -- move forward. */
6429 ST.oldloc = locinput;
f2ed9b32 6430 if (utf8_target)
c255a977
DM
6431 locinput += UTF8SKIP(locinput);
6432 else
6433 locinput++;
6434 ST.count++;
6435 curly_try_B_min_known:
6436 /* find the next place where 'B' could work, then call B */
6437 {
6438 int n;
f2ed9b32 6439 if (utf8_target) {
c255a977
DM
6440 n = (ST.oldloc == locinput) ? 0 : 1;
6441 if (ST.c1 == ST.c2) {
c255a977 6442 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6443 while (locinput <= ST.maxpos
6444 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6445 {
6446 locinput += UTF8SKIP(locinput);
c255a977
DM
6447 n++;
6448 }
1aa99e6b
IH
6449 }
6450 else {
c255a977 6451 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
6452 while (locinput <= ST.maxpos
6453 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6454 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6455 {
6456 locinput += UTF8SKIP(locinput);
c255a977 6457 n++;
1aa99e6b 6458 }
0fe9bf95
IZ
6459 }
6460 }
5400f398 6461 else { /* Not utf8_target */
c255a977
DM
6462 if (ST.c1 == ST.c2) {
6463 while (locinput <= ST.maxpos &&
6464 UCHARAT(locinput) != ST.c1)
6465 locinput++;
bbce6d69 6466 }
c255a977
DM
6467 else {
6468 while (locinput <= ST.maxpos
6469 && UCHARAT(locinput) != ST.c1
6470 && UCHARAT(locinput) != ST.c2)
6471 locinput++;
a0ed51b3 6472 }
c255a977
DM
6473 n = locinput - ST.oldloc;
6474 }
6475 if (locinput > ST.maxpos)
6476 sayNO;
c255a977 6477 if (n) {
eb72505d
DM
6478 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6479 * at b; check that everything between oldloc and
6480 * locinput matches */
6481 char *li = ST.oldloc;
c255a977 6482 ST.count += n;
f9176b44 6483 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
4633a7c4 6484 sayNO;
eb72505d 6485 assert(n == REG_INFTY || locinput == li);
a0d0e21e 6486 }
c255a977 6487 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6488 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6489 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6490 goto fake_end;
6491 }
4d5016e5 6492 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 6493 }
118e2215 6494 assert(0); /* NOTREACHED */
c255a977
DM
6495
6496
6497 case CURLY_B_min_fail:
6498 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
6499
6500 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6501 if (ST.paren) {
6502 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6503 }
c255a977 6504 /* failed -- move forward one */
f73aaa43 6505 {
eb72505d 6506 char *li = locinput;
f9176b44 6507 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
f73aaa43
DM
6508 sayNO;
6509 }
eb72505d 6510 locinput = li;
f73aaa43
DM
6511 }
6512 {
c255a977 6513 ST.count++;
c255a977
DM
6514 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6515 ST.count > 0)) /* count overflow ? */
15272685 6516 {
c255a977
DM
6517 curly_try_B_min:
6518 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 6519 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6520 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
6521 goto fake_end;
6522 }
4d5016e5 6523 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
6524 }
6525 }
c74f6de9 6526 sayNO;
118e2215 6527 assert(0); /* NOTREACHED */
c255a977
DM
6528
6529
6530 curly_try_B_max:
6531 /* a successful greedy match: now try to match B */
40d049e4 6532 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 6533 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
6534 goto fake_end;
6535 }
c255a977 6536 {
220db18a 6537 bool could_match = locinput < reginfo->strend;
79a2a0e8 6538
c255a977 6539 /* If it could work, try it. */
79a2a0e8
KW
6540 if (ST.c1 != CHRTEST_VOID && could_match) {
6541 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6542 {
6543 could_match = memEQ(locinput,
6544 ST.c1_utf8,
6545 UTF8SKIP(locinput))
6546 || memEQ(locinput,
6547 ST.c2_utf8,
6548 UTF8SKIP(locinput));
6549 }
6550 else {
6551 could_match = UCHARAT(locinput) == ST.c1
6552 || UCHARAT(locinput) == ST.c2;
6553 }
6554 }
6555 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 6556 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 6557 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
118e2215 6558 assert(0); /* NOTREACHED */
c255a977
DM
6559 }
6560 }
924ba076 6561 /* FALLTHROUGH */
3c0563b9 6562
c255a977
DM
6563 case CURLY_B_max_fail:
6564 /* failed to find B in a greedy match */
c255a977
DM
6565
6566 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
6567 if (ST.paren) {
6568 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6569 }
c255a977
DM
6570 /* back up. */
6571 if (--ST.count < ST.min)
6572 sayNO;
eb72505d 6573 locinput = HOPc(locinput, -1);
c255a977
DM
6574 goto curly_try_B_max;
6575
6576#undef ST
6577
3c0563b9 6578 case END: /* last op of main pattern */
6bda09f9 6579 fake_end:
faec1544
DM
6580 if (cur_eval) {
6581 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544 6582
288b8c02 6583 st->u.eval.prev_rex = rex_sv; /* inner */
92da3157
DM
6584
6585 /* Save *all* the positions. */
6586 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
ec43f78b 6587 rex_sv = cur_eval->u.eval.prev_rex;
aed7b151 6588 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 6589 SET_reg_curpm(rex_sv);
8d919b0a 6590 rex = ReANY(rex_sv);
f8fc2ecf 6591 rexi = RXi_GET(rex);
faec1544 6592 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 6593
faec1544 6594 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
6595
6596 /* Restore parens of the outer rex without popping the
6597 * savestack */
92da3157
DM
6598 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6599 &maxopenparen);
faec1544
DM
6600
6601 st->u.eval.prev_eval = cur_eval;
6602 cur_eval = cur_eval->u.eval.prev_eval;
6603 DEBUG_EXECUTE_r(
2a49f0f5
JH
6604 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6605 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
6606 if ( nochange_depth )
6607 nochange_depth--;
6608
4d5016e5
DM
6609 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6610 locinput); /* match B */
faec1544
DM
6611 }
6612
3b0527fe 6613 if (locinput < reginfo->till) {
a3621e74 6614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
6615 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6616 PL_colors[4],
6d59b646
DM
6617 (long)(locinput - startpos),
6618 (long)(reginfo->till - startpos),
7821416a 6619 PL_colors[5]));
58e23c8d 6620
262b90c4 6621 sayNO_SILENT; /* Cannot match: too short. */
7821416a 6622 }
262b90c4 6623 sayYES; /* Success! */
dad79028
DM
6624
6625 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6626 DEBUG_EXECUTE_r(
6627 PerlIO_printf(Perl_debug_log,
6628 "%*s %ssubpattern success...%s\n",
5bc10b2c 6629 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 6630 sayYES; /* Success! */
dad79028 6631
40a82448
DM
6632#undef ST
6633#define ST st->u.ifmatch
6634
37f53970
DM
6635 {
6636 char *newstart;
6637
40a82448
DM
6638 case SUSPEND: /* (?>A) */
6639 ST.wanted = 1;
37f53970 6640 newstart = locinput;
9041c2e3 6641 goto do_ifmatch;
dad79028 6642
40a82448
DM
6643 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6644 ST.wanted = 0;
dad79028
DM
6645 goto ifmatch_trivial_fail_test;
6646
40a82448
DM
6647 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6648 ST.wanted = 1;
dad79028 6649 ifmatch_trivial_fail_test:
a0ed51b3 6650 if (scan->flags) {
52657f30 6651 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
6652 if (!s) {
6653 /* trivial fail */
24d3c4a9
DM
6654 if (logical) {
6655 logical = 0;
f2338a2e 6656 sw = 1 - cBOOL(ST.wanted);
dad79028 6657 }
40a82448 6658 else if (ST.wanted)
dad79028
DM
6659 sayNO;
6660 next = scan + ARG(scan);
6661 if (next == scan)
6662 next = NULL;
6663 break;
6664 }
37f53970 6665 newstart = s;
a0ed51b3
LW
6666 }
6667 else
37f53970 6668 newstart = locinput;
a0ed51b3 6669
c277df42 6670 do_ifmatch:
40a82448 6671 ST.me = scan;
24d3c4a9 6672 ST.logical = logical;
24d786f4
YO
6673 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6674
40a82448 6675 /* execute body of (?...A) */
37f53970 6676 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
118e2215 6677 assert(0); /* NOTREACHED */
37f53970 6678 }
40a82448
DM
6679
6680 case IFMATCH_A_fail: /* body of (?...A) failed */
6681 ST.wanted = !ST.wanted;
924ba076 6682 /* FALLTHROUGH */
40a82448
DM
6683
6684 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 6685 if (ST.logical) {
f2338a2e 6686 sw = cBOOL(ST.wanted);
40a82448
DM
6687 }
6688 else if (!ST.wanted)
6689 sayNO;
6690
37f53970
DM
6691 if (OP(ST.me) != SUSPEND) {
6692 /* restore old position except for (?>...) */
6693 locinput = st->locinput;
40a82448
DM
6694 }
6695 scan = ST.me + ARG(ST.me);
6696 if (scan == ST.me)
6697 scan = NULL;
6698 continue; /* execute B */
6699
6700#undef ST
dad79028 6701
3c0563b9
DM
6702 case LONGJMP: /* alternative with many branches compiles to
6703 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
6704 next = scan + ARG(scan);
6705 if (next == scan)
6706 next = NULL;
a0d0e21e 6707 break;
3c0563b9
DM
6708
6709 case COMMIT: /* (*COMMIT) */
220db18a 6710 reginfo->cutpoint = reginfo->strend;
e2e6a0f1 6711 /* FALLTHROUGH */
3c0563b9
DM
6712
6713 case PRUNE: /* (*PRUNE) */
e2e6a0f1 6714 if (!scan->flags)
ad64d0ec 6715 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 6716 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
118e2215 6717 assert(0); /* NOTREACHED */
3c0563b9 6718
54612592
YO
6719 case COMMIT_next_fail:
6720 no_final = 1;
6721 /* FALLTHROUGH */
3c0563b9
DM
6722
6723 case OPFAIL: /* (*FAIL) */
7f69552c 6724 sayNO;
118e2215 6725 assert(0); /* NOTREACHED */
e2e6a0f1
YO
6726
6727#define ST st->u.mark
3c0563b9 6728 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 6729 ST.prev_mark = mark_state;
5d458dd8 6730 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 6731 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 6732 mark_state = st;
4d5016e5
DM
6733 ST.mark_loc = locinput;
6734 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
118e2215 6735 assert(0); /* NOTREACHED */
3c0563b9 6736
e2e6a0f1
YO
6737 case MARKPOINT_next:
6738 mark_state = ST.prev_mark;
6739 sayYES;
118e2215 6740 assert(0); /* NOTREACHED */
3c0563b9 6741
e2e6a0f1 6742 case MARKPOINT_next_fail:
5d458dd8 6743 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
6744 {
6745 if (ST.mark_loc > startpoint)
6746 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6747 popmark = NULL; /* we found our mark */
6748 sv_commit = ST.mark_name;
6749
6750 DEBUG_EXECUTE_r({
5d458dd8 6751 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
6752 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6753 REPORT_CODE_OFF+depth*2, "",
be2597df 6754 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
6755 });
6756 }
6757 mark_state = ST.prev_mark;
5d458dd8
YO
6758 sv_yes_mark = mark_state ?
6759 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 6760 sayNO;
118e2215 6761 assert(0); /* NOTREACHED */
3c0563b9
DM
6762
6763 case SKIP: /* (*SKIP) */
5d458dd8 6764 if (scan->flags) {
2bf803e2 6765 /* (*SKIP) : if we fail we cut here*/
5d458dd8 6766 ST.mark_name = NULL;
e2e6a0f1 6767 ST.mark_loc = locinput;
4d5016e5 6768 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 6769 } else {
2bf803e2 6770 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
6771 otherwise do nothing. Meaning we need to scan
6772 */
6773 regmatch_state *cur = mark_state;
ad64d0ec 6774 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
6775
6776 while (cur) {
6777 if ( sv_eq( cur->u.mark.mark_name,
6778 find ) )
6779 {
6780 ST.mark_name = find;
4d5016e5 6781 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
6782 }
6783 cur = cur->u.mark.prev_mark;
6784 }
e2e6a0f1 6785 }
2bf803e2 6786 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 6787 break;
3c0563b9 6788
5d458dd8
YO
6789 case SKIP_next_fail:
6790 if (ST.mark_name) {
6791 /* (*CUT:NAME) - Set up to search for the name as we
6792 collapse the stack*/
6793 popmark = ST.mark_name;
6794 } else {
6795 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
6796 if (ST.mark_loc > startpoint)
6797 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
6798 /* but we set sv_commit to latest mark_name if there
6799 is one so they can test to see how things lead to this
6800 cut */
6801 if (mark_state)
6802 sv_commit=mark_state->u.mark.mark_name;
6803 }
e2e6a0f1
YO
6804 no_final = 1;
6805 sayNO;
118e2215 6806 assert(0); /* NOTREACHED */
e2e6a0f1 6807#undef ST
3c0563b9
DM
6808
6809 case LNBREAK: /* \R */
220db18a 6810 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
e1d1eefb 6811 locinput += n;
e1d1eefb
YO
6812 } else
6813 sayNO;
6814 break;
6815
a0d0e21e 6816 default:
b900a521 6817 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 6818 PTR2UV(scan), OP(scan));
cea2e8a9 6819 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
6820
6821 /* this is a point to jump to in order to increment
6822 * locinput by one character */
6823 increment_locinput:
e6ca698c 6824 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
6825 if (utf8_target) {
6826 locinput += PL_utf8skip[nextchr];
7016d6eb 6827 /* locinput is allowed to go 1 char off the end, but not 2+ */
220db18a 6828 if (locinput > reginfo->strend)
28b98f76 6829 sayNO;
28b98f76
DM
6830 }
6831 else
3640db6b 6832 locinput++;
28b98f76 6833 break;
5d458dd8
YO
6834
6835 } /* end switch */
95b24440 6836
5d458dd8
YO
6837 /* switch break jumps here */
6838 scan = next; /* prepare to execute the next op and ... */
6839 continue; /* ... jump back to the top, reusing st */
118e2215 6840 assert(0); /* NOTREACHED */
95b24440 6841
40a82448
DM
6842 push_yes_state:
6843 /* push a state that backtracks on success */
6844 st->u.yes.prev_yes_state = yes_state;
6845 yes_state = st;
924ba076 6846 /* FALLTHROUGH */
40a82448
DM
6847 push_state:
6848 /* push a new regex state, then continue at scan */
6849 {
6850 regmatch_state *newst;
6851
24b23f37
YO
6852 DEBUG_STACK_r({
6853 regmatch_state *cur = st;
6854 regmatch_state *curyes = yes_state;
6855 int curd = depth;
6856 regmatch_slab *slab = PL_regmatch_slab;
6857 for (;curd > -1;cur--,curd--) {
6858 if (cur < SLAB_FIRST(slab)) {
6859 slab = slab->prev;
6860 cur = SLAB_LAST(slab);
6861 }
6862 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6863 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 6864 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
6865 (curyes == cur) ? "yes" : ""
6866 );
6867 if (curyes == cur)
6868 curyes = cur->u.yes.prev_yes_state;
6869 }
6870 } else
6871 DEBUG_STATE_pp("push")
6872 );
40a82448 6873 depth++;
40a82448
DM
6874 st->locinput = locinput;
6875 newst = st+1;
6876 if (newst > SLAB_LAST(PL_regmatch_slab))
6877 newst = S_push_slab(aTHX);
6878 PL_regmatch_state = newst;
786e8c11 6879
4d5016e5 6880 locinput = pushinput;
40a82448
DM
6881 st = newst;
6882 continue;
118e2215 6883 assert(0); /* NOTREACHED */
40a82448 6884 }
a0d0e21e 6885 }
a687059c 6886
a0d0e21e
LW
6887 /*
6888 * We get here only if there's trouble -- normally "case END" is
6889 * the terminating point.
6890 */
cea2e8a9 6891 Perl_croak(aTHX_ "corrupted regexp pointers");
a0d0e21e 6892 /*NOTREACHED*/
4633a7c4
LW
6893 sayNO;
6894
262b90c4 6895yes:
77cb431f
DM
6896 if (yes_state) {
6897 /* we have successfully completed a subexpression, but we must now
6898 * pop to the state marked by yes_state and continue from there */
77cb431f 6899 assert(st != yes_state);
5bc10b2c
DM
6900#ifdef DEBUGGING
6901 while (st != yes_state) {
6902 st--;
6903 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6904 PL_regmatch_slab = PL_regmatch_slab->prev;
6905 st = SLAB_LAST(PL_regmatch_slab);
6906 }
e2e6a0f1 6907 DEBUG_STATE_r({
54612592
YO
6908 if (no_final) {
6909 DEBUG_STATE_pp("pop (no final)");
6910 } else {
6911 DEBUG_STATE_pp("pop (yes)");
6912 }
e2e6a0f1 6913 });
5bc10b2c
DM
6914 depth--;
6915 }
6916#else
77cb431f
DM
6917 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6918 || yes_state > SLAB_LAST(PL_regmatch_slab))
6919 {
6920 /* not in this slab, pop slab */
6921 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6922 PL_regmatch_slab = PL_regmatch_slab->prev;
6923 st = SLAB_LAST(PL_regmatch_slab);
6924 }
6925 depth -= (st - yes_state);
5bc10b2c 6926#endif
77cb431f
DM
6927 st = yes_state;
6928 yes_state = st->u.yes.prev_yes_state;
6929 PL_regmatch_state = st;
24b23f37 6930
3640db6b 6931 if (no_final)
5d458dd8 6932 locinput= st->locinput;
54612592 6933 state_num = st->resume_state + no_final;
24d3c4a9 6934 goto reenter_switch;
77cb431f
DM
6935 }
6936
a3621e74 6937 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 6938 PL_colors[4], PL_colors[5]));
02db2b7b 6939
bf2039a9 6940 if (reginfo->info_aux_eval) {
19b95bf0
DM
6941 /* each successfully executed (?{...}) block does the equivalent of
6942 * local $^R = do {...}
6943 * When popping the save stack, all these locals would be undone;
6944 * bypass this by setting the outermost saved $^R to the latest
6945 * value */
4b22688e
YO
6946 /* I dont know if this is needed or works properly now.
6947 * see code related to PL_replgv elsewhere in this file.
6948 * Yves
6949 */
19b95bf0
DM
6950 if (oreplsv != GvSV(PL_replgv))
6951 sv_setsv(oreplsv, GvSV(PL_replgv));
6952 }
95b24440 6953 result = 1;
aa283a38 6954 goto final_exit;
4633a7c4
LW
6955
6956no:
a3621e74 6957 DEBUG_EXECUTE_r(
7821416a 6958 PerlIO_printf(Perl_debug_log,
786e8c11 6959 "%*s %sfailed...%s\n",
5bc10b2c 6960 REPORT_CODE_OFF+depth*2, "",
786e8c11 6961 PL_colors[4], PL_colors[5])
7821416a 6962 );
aa283a38 6963
262b90c4 6964no_silent:
54612592
YO
6965 if (no_final) {
6966 if (yes_state) {
6967 goto yes;
6968 } else {
6969 goto final_exit;
6970 }
6971 }
aa283a38
DM
6972 if (depth) {
6973 /* there's a previous state to backtrack to */
40a82448
DM
6974 st--;
6975 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6976 PL_regmatch_slab = PL_regmatch_slab->prev;
6977 st = SLAB_LAST(PL_regmatch_slab);
6978 }
6979 PL_regmatch_state = st;
40a82448 6980 locinput= st->locinput;
40a82448 6981
5bc10b2c
DM
6982 DEBUG_STATE_pp("pop");
6983 depth--;
262b90c4
DM
6984 if (yes_state == st)
6985 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 6986
24d3c4a9
DM
6987 state_num = st->resume_state + 1; /* failure = success + 1 */
6988 goto reenter_switch;
95b24440 6989 }
24d3c4a9 6990 result = 0;
aa283a38 6991
262b90c4 6992 final_exit:
bbe252da 6993 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
6994 SV *sv_err = get_sv("REGERROR", 1);
6995 SV *sv_mrk = get_sv("REGMARK", 1);
6996 if (result) {
e2e6a0f1 6997 sv_commit = &PL_sv_no;
5d458dd8
YO
6998 if (!sv_yes_mark)
6999 sv_yes_mark = &PL_sv_yes;
7000 } else {
7001 if (!sv_commit)
7002 sv_commit = &PL_sv_yes;
7003 sv_yes_mark = &PL_sv_no;
7004 }
316ebaf2
JH
7005 assert(sv_err);
7006 assert(sv_mrk);
5d458dd8
YO
7007 sv_setsv(sv_err, sv_commit);
7008 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 7009 }
19b95bf0 7010
81ed78b2
DM
7011
7012 if (last_pushed_cv) {
7013 dSP;
7014 POP_MULTICALL;
4f8dbb2d 7015 PERL_UNUSED_VAR(SP);
81ed78b2
DM
7016 }
7017
9d9163fb
DM
7018 assert(!result || locinput - reginfo->strbeg >= 0);
7019 return result ? locinput - reginfo->strbeg : -1;
a687059c
LW
7020}
7021
7022/*
7023 - regrepeat - repeatedly match something simple, report how many
d60de1d1 7024 *
e64f369d
KW
7025 * What 'simple' means is a node which can be the operand of a quantifier like
7026 * '+', or {1,3}
7027 *
d60de1d1
DM
7028 * startposp - pointer a pointer to the start position. This is updated
7029 * to point to the byte following the highest successful
7030 * match.
7031 * p - the regnode to be repeatedly matched against.
220db18a 7032 * reginfo - struct holding match state, such as strend
4063ade8 7033 * max - maximum number of things to match.
d60de1d1 7034 * depth - (for debugging) backtracking depth.
a687059c 7035 */
76e3520e 7036STATIC I32
272d35c9 7037S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
f9176b44 7038 regmatch_info *const reginfo, I32 max, int depth)
a687059c 7039{
27da23d5 7040 dVAR;
4063ade8 7041 char *scan; /* Pointer to current position in target string */
eb578fdb 7042 I32 c;
220db18a 7043 char *loceol = reginfo->strend; /* local version */
4063ade8 7044 I32 hardcount = 0; /* How many matches so far */
ba44c216 7045 bool utf8_target = reginfo->is_utf8_target;
3018b823 7046 int to_complement = 0; /* Invert the result? */
d513472c 7047 UV utf8_flags;
3018b823 7048 _char_class_number classnum;
4f55667c
SP
7049#ifndef DEBUGGING
7050 PERL_UNUSED_ARG(depth);
7051#endif
a0d0e21e 7052
7918f24d
NC
7053 PERL_ARGS_ASSERT_REGREPEAT;
7054
f73aaa43 7055 scan = *startposp;
faf11cac
HS
7056 if (max == REG_INFTY)
7057 max = I32_MAX;
dfb8f192 7058 else if (! utf8_target && loceol - scan > max)
7f596f4c 7059 loceol = scan + max;
4063ade8
KW
7060
7061 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
7062 * to the maximum of how far we should go in it (leaving it set to the real
7063 * end, if the maximum permissible would take us beyond that). This allows
7064 * us to make the loop exit condition that we haven't gone past <loceol> to
7065 * also mean that we haven't exceeded the max permissible count, saving a
7066 * test each time through the loop. But it assumes that the OP matches a
7067 * single byte, which is true for most of the OPs below when applied to a
7068 * non-UTF-8 target. Those relatively few OPs that don't have this
7069 * characteristic will have to compensate.
7070 *
7071 * There is no adjustment for UTF-8 targets, as the number of bytes per
7072 * character varies. OPs will have to test both that the count is less
7073 * than the max permissible (using <hardcount> to keep track), and that we
7074 * are still within the bounds of the string (using <loceol>. A few OPs
7075 * match a single byte no matter what the encoding. They can omit the max
7076 * test if, for the UTF-8 case, they do the adjustment that was skipped
7077 * above.
7078 *
7079 * Thus, the code above sets things up for the common case; and exceptional
7080 * cases need extra work; the common case is to make sure <scan> doesn't
7081 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
7082 * count doesn't exceed the maximum permissible */
7083
a0d0e21e 7084 switch (OP(p)) {
22c35a8c 7085 case REG_ANY:
f2ed9b32 7086 if (utf8_target) {
1aa99e6b 7087 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
7088 scan += UTF8SKIP(scan);
7089 hardcount++;
7090 }
7091 } else {
7092 while (scan < loceol && *scan != '\n')
7093 scan++;
a0ed51b3
LW
7094 }
7095 break;
ffc61ed2 7096 case SANY:
f2ed9b32 7097 if (utf8_target) {
a0804c9e 7098 while (scan < loceol && hardcount < max) {
def8e4ea
JH
7099 scan += UTF8SKIP(scan);
7100 hardcount++;
7101 }
7102 }
7103 else
7104 scan = loceol;
a0ed51b3 7105 break;
4063ade8 7106 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
9597860a 7107 if (utf8_target && loceol - scan > max) {
4063ade8
KW
7108
7109 /* <loceol> hadn't been adjusted in the UTF-8 case */
7110 scan += max;
7111 }
7112 else {
7113 scan = loceol;
7114 }
f33976b4 7115 break;
59d32103 7116 case EXACT:
f9176b44 7117 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 7118
59d32103 7119 c = (U8)*STRING(p);
59d32103 7120
5e4a1da1
KW
7121 /* Can use a simple loop if the pattern char to match on is invariant
7122 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
7123 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
7124 * true iff it doesn't matter if the argument is in UTF-8 or not */
f9176b44 7125 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
e9369824 7126 if (utf8_target && loceol - scan > max) {
4063ade8
KW
7127 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7128 * since here, to match at all, 1 char == 1 byte */
7129 loceol = scan + max;
7130 }
59d32103
KW
7131 while (scan < loceol && UCHARAT(scan) == c) {
7132 scan++;
7133 }
7134 }
f9176b44 7135 else if (reginfo->is_utf8_pat) {
5e4a1da1
KW
7136 if (utf8_target) {
7137 STRLEN scan_char_len;
5e4a1da1 7138
4063ade8 7139 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
7140 * string EQ */
7141 while (hardcount < max
9a902117
KW
7142 && scan < loceol
7143 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
5e4a1da1
KW
7144 && memEQ(scan, STRING(p), scan_char_len))
7145 {
4200a00c 7146 scan += scan_char_len;
5e4a1da1
KW
7147 hardcount++;
7148 }
7149 }
7150 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 7151
5e4a1da1
KW
7152 /* Target isn't utf8; convert the character in the UTF-8
7153 * pattern to non-UTF8, and do a simple loop */
94bb8c36 7154 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
5e4a1da1
KW
7155 while (scan < loceol && UCHARAT(scan) == c) {
7156 scan++;
7157 }
7158 } /* else pattern char is above Latin1, can't possibly match the
7159 non-UTF-8 target */
b40a2c17 7160 }
5e4a1da1 7161 else {
59d32103 7162
5e4a1da1
KW
7163 /* Here, the string must be utf8; pattern isn't, and <c> is
7164 * different in utf8 than not, so can't compare them directly.
7165 * Outside the loop, find the two utf8 bytes that represent c, and
7166 * then look for those in sequence in the utf8 string */
59d32103
KW
7167 U8 high = UTF8_TWO_BYTE_HI(c);
7168 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
7169
7170 while (hardcount < max
7171 && scan + 1 < loceol
7172 && UCHARAT(scan) == high
7173 && UCHARAT(scan + 1) == low)
7174 {
7175 scan += 2;
7176 hardcount++;
7177 }
7178 }
7179 break;
5e4a1da1 7180
098b07d5
KW
7181 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
7182 assert(! reginfo->is_utf8_pat);
924ba076 7183 /* FALLTHROUGH */
2f7f8cb1 7184 case EXACTFA:
098b07d5 7185 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
7186 goto do_exactf;
7187
d4e0b827 7188 case EXACTFL:
cea315b6 7189 utf8_flags = FOLDEQ_LOCALE;
17580e7a
KW
7190 goto do_exactf;
7191
2fdb7295
KW
7192 case EXACTF: /* This node only generated for non-utf8 patterns */
7193 assert(! reginfo->is_utf8_pat);
098b07d5
KW
7194 utf8_flags = 0;
7195 goto do_exactf;
62bf7766 7196
3c760661 7197 case EXACTFU_SS:
9a5a5549 7198 case EXACTFU:
f9176b44 7199 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 7200
613a425d
KW
7201 do_exactf: {
7202 int c1, c2;
7203 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 7204
f9176b44 7205 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 7206
984e6dd1 7207 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
aed7b151 7208 reginfo))
984e6dd1 7209 {
613a425d 7210 if (c1 == CHRTEST_VOID) {
49b95fad 7211 /* Use full Unicode fold matching */
220db18a 7212 char *tmpeol = reginfo->strend;
f9176b44 7213 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
49b95fad
KW
7214 while (hardcount < max
7215 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
7216 STRING(p), NULL, pat_len,
f9176b44 7217 reginfo->is_utf8_pat, utf8_flags))
49b95fad
KW
7218 {
7219 scan = tmpeol;
220db18a 7220 tmpeol = reginfo->strend;
49b95fad
KW
7221 hardcount++;
7222 }
613a425d
KW
7223 }
7224 else if (utf8_target) {
7225 if (c1 == c2) {
4063ade8
KW
7226 while (scan < loceol
7227 && hardcount < max
613a425d
KW
7228 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
7229 {
7230 scan += UTF8SKIP(scan);
7231 hardcount++;
7232 }
7233 }
7234 else {
4063ade8
KW
7235 while (scan < loceol
7236 && hardcount < max
613a425d
KW
7237 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
7238 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
7239 {
7240 scan += UTF8SKIP(scan);
7241 hardcount++;
7242 }
7243 }
7244 }
7245 else if (c1 == c2) {
7246 while (scan < loceol && UCHARAT(scan) == c1) {
7247 scan++;
7248 }
7249 }
7250 else {
7251 while (scan < loceol &&
7252 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
7253 {
7254 scan++;
7255 }
7256 }
634c83a2 7257 }
bbce6d69 7258 break;
613a425d 7259 }
a0d0e21e 7260 case ANYOF:
e0193e47 7261 if (utf8_target) {
4e8910e0 7262 while (hardcount < max
9a902117 7263 && scan < loceol
3db24e1e 7264 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
4e8910e0 7265 {
9a902117 7266 scan += UTF8SKIP(scan);
ffc61ed2
JH
7267 hardcount++;
7268 }
7269 } else {
32fc9b6a 7270 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
7271 scan++;
7272 }
a0d0e21e 7273 break;
4063ade8 7274
3018b823 7275 /* The argument (FLAGS) to all the POSIX node types is the class number */
980866de 7276
3018b823
KW
7277 case NPOSIXL:
7278 to_complement = 1;
7279 /* FALLTHROUGH */
980866de 7280
3018b823 7281 case POSIXL:
3018b823
KW
7282 if (! utf8_target) {
7283 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7284 *scan)))
a12cf05f 7285 {
3018b823
KW
7286 scan++;
7287 }
7288 } else {
7289 while (hardcount < max && scan < loceol
7290 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7291 (U8 *) scan)))
7292 {
7293 scan += UTF8SKIP(scan);
ffc61ed2
JH
7294 hardcount++;
7295 }
a0ed51b3
LW
7296 }
7297 break;
0658cdde 7298
3018b823
KW
7299 case POSIXD:
7300 if (utf8_target) {
7301 goto utf8_posix;
7302 }
7303 /* FALLTHROUGH */
7304
0658cdde 7305 case POSIXA:
0430522f 7306 if (utf8_target && loceol - scan > max) {
4063ade8 7307
7aee35ff
KW
7308 /* We didn't adjust <loceol> at the beginning of this routine
7309 * because is UTF-8, but it is actually ok to do so, since here, to
7310 * match, 1 char == 1 byte. */
4063ade8
KW
7311 loceol = scan + max;
7312 }
7313 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
7314 scan++;
7315 }
7316 break;
980866de 7317
3018b823
KW
7318 case NPOSIXD:
7319 if (utf8_target) {
7320 to_complement = 1;
7321 goto utf8_posix;
7322 }
924ba076 7323 /* FALLTHROUGH */
980866de 7324
3018b823
KW
7325 case NPOSIXA:
7326 if (! utf8_target) {
7327 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
a12cf05f
KW
7328 scan++;
7329 }
4063ade8 7330 }
3018b823 7331 else {
980866de 7332
3018b823 7333 /* The complement of something that matches only ASCII matches all
837226c8 7334 * non-ASCII, plus everything in ASCII that isn't in the class. */
bedac28b 7335 while (hardcount < max && scan < loceol
837226c8 7336 && (! isASCII_utf8(scan)
3018b823 7337 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
a12cf05f 7338 {
3018b823 7339 scan += UTF8SKIP(scan);
ffc61ed2
JH
7340 hardcount++;
7341 }
3018b823
KW
7342 }
7343 break;
980866de 7344
3018b823
KW
7345 case NPOSIXU:
7346 to_complement = 1;
7347 /* FALLTHROUGH */
7348
7349 case POSIXU:
7350 if (! utf8_target) {
7351 while (scan < loceol && to_complement
7352 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
4063ade8 7353 {
3018b823
KW
7354 scan++;
7355 }
cfaf538b
KW
7356 }
7357 else {
3018b823
KW
7358 utf8_posix:
7359 classnum = (_char_class_number) FLAGS(p);
7360 if (classnum < _FIRST_NON_SWASH_CC) {
7361
7362 /* Here, a swash is needed for above-Latin1 code points.
7363 * Process as many Latin1 code points using the built-in rules.
7364 * Go to another loop to finish processing upon encountering
7365 * the first Latin1 code point. We could do that in this loop
7366 * as well, but the other way saves having to test if the swash
7367 * has been loaded every time through the loop: extra space to
7368 * save a test. */
7369 while (hardcount < max && scan < loceol) {
7370 if (UTF8_IS_INVARIANT(*scan)) {
7371 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7372 classnum))))
7373 {
7374 break;
7375 }
7376 scan++;
7377 }
7378 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7379 if (! (to_complement
94bb8c36
KW
7380 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7381 *(scan + 1)),
3018b823
KW
7382 classnum))))
7383 {
7384 break;
7385 }
7386 scan += 2;
7387 }
7388 else {
7389 goto found_above_latin1;
7390 }
7391
7392 hardcount++;
7393 }
7394 }
7395 else {
7396 /* For these character classes, the knowledge of how to handle
7397 * every code point is compiled in to Perl via a macro. This
7398 * code is written for making the loops as tight as possible.
7399 * It could be refactored to save space instead */
7400 switch (classnum) {
7401 case _CC_ENUM_SPACE: /* XXX would require separate code
7402 if we revert the change of \v
7403 matching this */
924ba076 7404 /* FALLTHROUGH */
3018b823
KW
7405 case _CC_ENUM_PSXSPC:
7406 while (hardcount < max
7407 && scan < loceol
7408 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7409 {
7410 scan += UTF8SKIP(scan);
7411 hardcount++;
7412 }
7413 break;
7414 case _CC_ENUM_BLANK:
7415 while (hardcount < max
7416 && scan < loceol
7417 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7418 {
7419 scan += UTF8SKIP(scan);
7420 hardcount++;
7421 }
7422 break;
7423 case _CC_ENUM_XDIGIT:
7424 while (hardcount < max
7425 && scan < loceol
7426 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7427 {
7428 scan += UTF8SKIP(scan);
7429 hardcount++;
7430 }
7431 break;
7432 case _CC_ENUM_VERTSPACE:
7433 while (hardcount < max
7434 && scan < loceol
7435 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7436 {
7437 scan += UTF8SKIP(scan);
7438 hardcount++;
7439 }
7440 break;
7441 case _CC_ENUM_CNTRL:
7442 while (hardcount < max
7443 && scan < loceol
7444 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7445 {
7446 scan += UTF8SKIP(scan);
7447 hardcount++;
7448 }
7449 break;
7450 default:
7451 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7452 }
7453 }
a0ed51b3 7454 }
3018b823 7455 break;
4063ade8 7456
3018b823
KW
7457 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7458
7459 /* Load the swash if not already present */
7460 if (! PL_utf8_swash_ptrs[classnum]) {
7461 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7462 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
2a16ac92
KW
7463 "utf8",
7464 "",
7465 &PL_sv_undef, 1, 0,
7466 PL_XPosix_ptrs[classnum], &flags);
4063ade8 7467 }
3018b823
KW
7468
7469 while (hardcount < max && scan < loceol
7470 && to_complement ^ cBOOL(_generic_utf8(
7471 classnum,
7472 scan,
7473 swash_fetch(PL_utf8_swash_ptrs[classnum],
7474 (U8 *) scan,
7475 TRUE))))
7476 {
7477 scan += UTF8SKIP(scan);
7478 hardcount++;
7479 }
7480 break;
7481
e1d1eefb 7482 case LNBREAK:
e64f369d
KW
7483 if (utf8_target) {
7484 while (hardcount < max && scan < loceol &&
7485 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7486 scan += c;
7487 hardcount++;
7488 }
7489 } else {
7490 /* LNBREAK can match one or two latin chars, which is ok, but we
7491 * have to use hardcount in this situation, and throw away the
7492 * adjustment to <loceol> done before the switch statement */
220db18a 7493 loceol = reginfo->strend;
e64f369d
KW
7494 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7495 scan+=c;
7496 hardcount++;
7497 }
7498 }
7499 break;
e1d1eefb 7500
584b1f02
KW
7501 case BOUND:
7502 case BOUNDA:
7503 case BOUNDL:
7504 case BOUNDU:
7505 case EOS:
7506 case GPOS:
7507 case KEEPS:
7508 case NBOUND:
7509 case NBOUNDA:
7510 case NBOUNDL:
7511 case NBOUNDU:
7512 case OPFAIL:
7513 case SBOL:
7514 case SEOL:
7515 /* These are all 0 width, so match right here or not at all. */
7516 break;
7517
7518 default:
7519 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7520 assert(0); /* NOTREACHED */
7521
a0d0e21e 7522 }
a687059c 7523
a0ed51b3
LW
7524 if (hardcount)
7525 c = hardcount;
7526 else
f73aaa43
DM
7527 c = scan - *startposp;
7528 *startposp = scan;
a687059c 7529
a3621e74 7530 DEBUG_r({
e68ec53f 7531 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 7532 DEBUG_EXECUTE_r({
e68ec53f 7533 SV * const prop = sv_newmortal();
2395827c 7534 regprop(prog, prop, p, reginfo);
e68ec53f 7535 PerlIO_printf(Perl_debug_log,
be8e71aa 7536 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 7537 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 7538 });
be8e71aa 7539 });
9041c2e3 7540
a0d0e21e 7541 return(c);
a687059c
LW
7542}
7543
c277df42 7544
be8e71aa 7545#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 7546/*
6c6525b8 7547- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
7548create a copy so that changes the caller makes won't change the shared one.
7549If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 7550 */
ffc61ed2 7551SV *
5aaab254 7552Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 7553{
6c6525b8 7554 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
7555
7556 if (altsvp) {
7557 *altsvp = NULL;
7558 }
7559
77d654fb 7560 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL));
6c6525b8 7561}
6c6525b8 7562
77d654fb
KW
7563SV *
7564Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
7565 const regnode* node,
7566 bool doinit,
7567 SV** listsvp,
7568 SV** only_utf8_locale_ptr)
6c6525b8 7569{
77d654fb
KW
7570 /* For internal core use only.
7571 * Returns the swash for the input 'node' in the regex 'prog'.
a498e2f8 7572 * If <doinit> is 'true', will attempt to create the swash if not already
8c9eb58f 7573 * done.
a498e2f8
KW
7574 * If <listsvp> is non-null, will return the printable contents of the
7575 * swash. This can be used to get debugging information even before the
7576 * swash exists, by calling this function with 'doinit' set to false, in
7577 * which case the components that will be used to eventually create the
7578 * swash are returned (in a printable form).
8c9eb58f
KW
7579 * Tied intimately to how regcomp.c sets up the data structure */
7580
97aff369 7581 dVAR;
9e55ce06 7582 SV *sw = NULL;
a498e2f8 7583 SV *si = NULL; /* Input swash initialization string */
7a6c6baa
KW
7584 SV* invlist = NULL;
7585
f8fc2ecf
YO
7586 RXi_GET_DECL(prog,progi);
7587 const struct reg_data * const data = prog ? progi->data : NULL;
ffc61ed2 7588
77d654fb 7589 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
7918f24d 7590
1ee208c4
KW
7591 assert(ANYOF_FLAGS(node)
7592 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
ccb2541c 7593
4f639d21 7594 if (data && data->count) {
a3b680e6 7595 const U32 n = ARG(node);
ffc61ed2 7596
4f639d21 7597 if (data->what[n] == 's') {
ad64d0ec
NC
7598 SV * const rv = MUTABLE_SV(data->data[n]);
7599 AV * const av = MUTABLE_AV(SvRV(rv));
2d03de9c 7600 SV **const ary = AvARRAY(av);
87367d5f 7601 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9041c2e3 7602
8c9eb58f 7603 si = *ary; /* ary[0] = the string to initialize the swash with */
b11f357e 7604
1ee208c4
KW
7605 /* Elements 3 and 4 are either both present or both absent. [3] is
7606 * any inversion list generated at compile time; [4] indicates if
7a6c6baa 7607 * that inversion list has any user-defined properties in it. */
1ee208c4
KW
7608 if (av_tindex(av) >= 2) {
7609 if (only_utf8_locale_ptr
7610 && ary[2]
7611 && ary[2] != &PL_sv_undef)
7612 {
7613 *only_utf8_locale_ptr = ary[2];
7614 }
7615 else {
316ebaf2 7616 assert(only_utf8_locale_ptr);
1ee208c4
KW
7617 *only_utf8_locale_ptr = NULL;
7618 }
7619
b9f2b683 7620 if (av_tindex(av) >= 3) {
b574cd74
KW
7621 invlist = ary[3];
7622 if (SvUV(ary[4])) {
7623 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7624 }
7625 }
7626 else {
7627 invlist = NULL;
83199d38 7628 }
1ee208c4 7629 }
7a6c6baa 7630
8c9eb58f
KW
7631 /* Element [1] is reserved for the set-up swash. If already there,
7632 * return it; if not, create it and store it there */
ce0d59fd 7633 if (ary[1] && SvROK(ary[1])) {
f192cf32
KW
7634 sw = ary[1];
7635 }
cd3413cb
KW
7636 else if (doinit && ((si && si != &PL_sv_undef)
7637 || (invlist && invlist != &PL_sv_undef))) {
316ebaf2 7638 assert(si);
7a6c6baa
KW
7639 sw = _core_swash_init("utf8", /* the utf8 package */
7640 "", /* nameless */
7641 si,
7642 1, /* binary */
7643 0, /* not from tr/// */
7a6c6baa 7644 invlist,
83199d38 7645 &swash_init_flags);
ffc61ed2
JH
7646 (void)av_store(av, 1, sw);
7647 }
7648 }
7649 }
7650
a498e2f8 7651 /* If requested, return a printable version of what this swash matches */
7a6c6baa
KW
7652 if (listsvp) {
7653 SV* matches_string = newSVpvn("", 0);
7a6c6baa 7654
a498e2f8
KW
7655 /* The swash should be used, if possible, to get the data, as it
7656 * contains the resolved data. But this function can be called at
7657 * compile-time, before everything gets resolved, in which case we
7658 * return the currently best available information, which is the string
7659 * that will eventually be used to do that resolving, 'si' */
872dd7e0
KW
7660 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7661 && (si && si != &PL_sv_undef))
7662 {
7a6c6baa
KW
7663 sv_catsv(matches_string, si);
7664 }
7665
7666 /* Add the inversion list to whatever we have. This may have come from
7667 * the swash, or from an input parameter */
7668 if (invlist) {
7669 sv_catsv(matches_string, _invlist_contents(invlist));
7670 }
7671 *listsvp = matches_string;
7672 }
7673
ffc61ed2
JH
7674 return sw;
7675}
3e63bed3 7676#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
ffc61ed2
JH
7677
7678/*
ba7b4546 7679 - reginclass - determine if a character falls into a character class
832705d4 7680
6698fab5
KW
7681 n is the ANYOF regnode
7682 p is the target string
3db24e1e 7683 p_end points to one byte beyond the end of the target string
6698fab5 7684 utf8_target tells whether p is in UTF-8.
832705d4 7685
635cd5d4 7686 Returns true if matched; false otherwise.
eba1359e 7687
d5788240
KW
7688 Note that this can be a synthetic start class, a combination of various
7689 nodes, so things you think might be mutually exclusive, such as locale,
7690 aren't. It can match both locale and non-locale
7691
bbce6d69 7692 */
7693
76e3520e 7694STATIC bool
3db24e1e 7695S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
bbce6d69 7696{
27da23d5 7697 dVAR;
a3b680e6 7698 const char flags = ANYOF_FLAGS(n);
bbce6d69 7699 bool match = FALSE;
cc07378b 7700 UV c = *p;
1aa99e6b 7701
7918f24d
NC
7702 PERL_ARGS_ASSERT_REGINCLASS;
7703
afd2eb18
KW
7704 /* If c is not already the code point, get it. Note that
7705 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7706 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 7707 STRLEN c_len = 0;
3db24e1e 7708 c = utf8n_to_uvchr(p, p_end - p, &c_len,
6182169b
KW
7709 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7710 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7711 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7712 * UTF8_ALLOW_FFFF */
f7ab54c6 7713 if (c_len == (STRLEN)-1)
e8a70c6f 7714 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
19f67299 7715 }
4b3cda86 7716
7cdde544
KW
7717 /* If this character is potentially in the bitmap, check it */
7718 if (c < 256) {
ffc61ed2
JH
7719 if (ANYOF_BITMAP_TEST(n, c))
7720 match = TRUE;
7bc66b18 7721 else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL
11454c59
KW
7722 && ! utf8_target
7723 && ! isASCII(c))
7724 {
7725 match = TRUE;
7726 }
1462525b 7727 else if (flags & ANYOF_LOCALE_FLAGS) {
b99851e1 7728 if (flags & ANYOF_LOC_FOLD) {
b99851e1
KW
7729 if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
7730 match = TRUE;
7731 }
7732 }
ee48a02a 7733 if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) {
31c7f561
KW
7734
7735 /* The data structure is arranged so bits 0, 2, 4, ... are set
7736 * if the class includes the Posix character class given by
7737 * bit/2; and 1, 3, 5, ... are set if the class includes the
7738 * complemented Posix class given by int(bit/2). So we loop
7739 * through the bits, each time changing whether we complement
7740 * the result or not. Suppose for the sake of illustration
7741 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7742 * is set, it means there is a match for this ANYOF node if the
7743 * character is in the class given by the expression (0 / 2 = 0
7744 * = \w). If it is in that class, isFOO_lc() will return 1,
7745 * and since 'to_complement' is 0, the result will stay TRUE,
7746 * and we exit the loop. Suppose instead that bit 0 is 0, but
7747 * bit 1 is 1. That means there is a match if the character
7748 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7749 * but will on bit 1. On the second iteration 'to_complement'
7750 * will be 1, so the exclusive or will reverse things, so we
7751 * are testing for \W. On the third iteration, 'to_complement'
7752 * will be 0, and we would be testing for \s; the fourth
b0d691b2
KW
7753 * iteration would test for \S, etc.
7754 *
7755 * Note that this code assumes that all the classes are closed
7756 * under folding. For example, if a character matches \w, then
7757 * its fold does too; and vice versa. This should be true for
7758 * any well-behaved locale for all the currently defined Posix
7759 * classes, except for :lower: and :upper:, which are handled
7760 * by the pseudo-class :cased: which matches if either of the
7761 * other two does. To get rid of this assumption, an outer
7762 * loop could be used below to iterate over both the source
7763 * character, and its fold (if different) */
31c7f561
KW
7764
7765 int count = 0;
7766 int to_complement = 0;
522b3c1e 7767
31c7f561 7768 while (count < ANYOF_MAX) {
8efd3f97 7769 if (ANYOF_POSIXL_TEST(n, count)
31c7f561
KW
7770 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7771 {
7772 match = TRUE;
7773 break;
7774 }
7775 count++;
7776 to_complement ^= 1;
7777 }
ffc61ed2 7778 }
a0ed51b3 7779 }
a0ed51b3
LW
7780 }
7781
31f05a37 7782
7cdde544 7783 /* If the bitmap didn't (or couldn't) match, and something outside the
3b04b210 7784 * bitmap could match, try that. */
ef87b810 7785 if (!match) {
5c2c7b4d 7786 if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) {
10ee90d2 7787 match = TRUE; /* Everything above 255 matches */
e051a21d 7788 }
3b04b210 7789 else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
1ee208c4
KW
7790 || (utf8_target && (flags & ANYOF_UTF8))
7791 || ((flags & ANYOF_LOC_FOLD)
7792 && IN_UTF8_CTYPE_LOCALE
7793 && ARG(n) != ANYOF_NONBITMAP_EMPTY))
3b04b210 7794 {
1ee208c4
KW
7795 SV* only_utf8_locale = NULL;
7796 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
7797 &only_utf8_locale);
7cdde544 7798 if (sw) {
893ef8be 7799 U8 utf8_buffer[2];
7cdde544
KW
7800 U8 * utf8_p;
7801 if (utf8_target) {
7802 utf8_p = (U8 *) p;
e0193e47 7803 } else { /* Convert to utf8 */
893ef8be
KW
7804 utf8_p = utf8_buffer;
7805 append_utf8_from_native_byte(*p, &utf8_p);
7806 utf8_p = utf8_buffer;
7cdde544 7807 }
f56b6394 7808
e0193e47 7809 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 7810 match = TRUE;
e0193e47 7811 }
7cdde544 7812 }
1ee208c4
KW
7813 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
7814 match = _invlist_contains_cp(only_utf8_locale, c);
7815 }
7cdde544 7816 }
5073ffbd
KW
7817
7818 if (UNICODE_IS_SUPER(c)
ae986089 7819 && (flags & ANYOF_WARN_SUPER)
5073ffbd
KW
7820 && ckWARN_d(WARN_NON_UNICODE))
7821 {
7822 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2d88a86a 7823 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
5073ffbd 7824 }
7cdde544
KW
7825 }
7826
5dbb0c08
KW
7827#if ANYOF_INVERT != 1
7828 /* Depending on compiler optimization cBOOL takes time, so if don't have to
7829 * use it, don't */
7830# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
7831#endif
7832
f0fdc1c9 7833 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
5dbb0c08 7834 return (flags & ANYOF_INVERT) ^ match;
a0ed51b3 7835}
161b471a 7836
dfe13c55 7837STATIC U8 *
ea3daa5d 7838S_reghop3(U8 *s, SSize_t off, const U8* lim)
9041c2e3 7839{
6af86488
KW
7840 /* return the position 'off' UTF-8 characters away from 's', forward if
7841 * 'off' >= 0, backwards if negative. But don't go outside of position
7842 * 'lim', which better be < s if off < 0 */
7843
97aff369 7844 dVAR;
7918f24d
NC
7845
7846 PERL_ARGS_ASSERT_REGHOP3;
7847
a0ed51b3 7848 if (off >= 0) {
1aa99e6b 7849 while (off-- && s < lim) {
ffc61ed2 7850 /* XXX could check well-formedness here */
a0ed51b3 7851 s += UTF8SKIP(s);
ffc61ed2 7852 }
a0ed51b3
LW
7853 }
7854 else {
1de06328
YO
7855 while (off++ && s > lim) {
7856 s--;
7857 if (UTF8_IS_CONTINUED(*s)) {
7858 while (s > lim && UTF8_IS_CONTINUATION(*s))
7859 s--;
a0ed51b3 7860 }
1de06328 7861 /* XXX could check well-formedness here */
a0ed51b3
LW
7862 }
7863 }
7864 return s;
7865}
161b471a 7866
dfe13c55 7867STATIC U8 *
ea3daa5d 7868S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
1de06328
YO
7869{
7870 dVAR;
7918f24d
NC
7871
7872 PERL_ARGS_ASSERT_REGHOP4;
7873
1de06328
YO
7874 if (off >= 0) {
7875 while (off-- && s < rlim) {
7876 /* XXX could check well-formedness here */
7877 s += UTF8SKIP(s);
7878 }
7879 }
7880 else {
7881 while (off++ && s > llim) {
7882 s--;
7883 if (UTF8_IS_CONTINUED(*s)) {
7884 while (s > llim && UTF8_IS_CONTINUATION(*s))
7885 s--;
7886 }
7887 /* XXX could check well-formedness here */
7888 }
7889 }
7890 return s;
7891}
1de06328 7892
557f47af
DM
7893/* like reghop3, but returns NULL on overrun, rather than returning last
7894 * char pos */
7895
1de06328 7896STATIC U8 *
ea3daa5d 7897S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
a0ed51b3 7898{
97aff369 7899 dVAR;
7918f24d
NC
7900
7901 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7902
a0ed51b3 7903 if (off >= 0) {
1aa99e6b 7904 while (off-- && s < lim) {
ffc61ed2 7905 /* XXX could check well-formedness here */
a0ed51b3 7906 s += UTF8SKIP(s);
ffc61ed2 7907 }
a0ed51b3 7908 if (off >= 0)
3dab1dad 7909 return NULL;
a0ed51b3
LW
7910 }
7911 else {
1de06328
YO
7912 while (off++ && s > lim) {
7913 s--;
7914 if (UTF8_IS_CONTINUED(*s)) {
7915 while (s > lim && UTF8_IS_CONTINUATION(*s))
7916 s--;
a0ed51b3 7917 }
1de06328 7918 /* XXX could check well-formedness here */
a0ed51b3
LW
7919 }
7920 if (off <= 0)
3dab1dad 7921 return NULL;
a0ed51b3
LW
7922 }
7923 return s;
7924}
51371543 7925
a75351a1
DM
7926
7927/* when executing a regex that may have (?{}), extra stuff needs setting
7928 up that will be visible to the called code, even before the current
7929 match has finished. In particular:
7930
7931 * $_ is localised to the SV currently being matched;
7932 * pos($_) is created if necessary, ready to be updated on each call-out
7933 to code;
7934 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7935 isn't set until the current pattern is successfully finished), so that
7936 $1 etc of the match-so-far can be seen;
7937 * save the old values of subbeg etc of the current regex, and set then
7938 to the current string (again, this is normally only done at the end
7939 of execution)
a75351a1
DM
7940*/
7941
7942static void
7943S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7944{
7945 MAGIC *mg;
7946 regexp *const rex = ReANY(reginfo->prog);
bf2039a9 7947 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8adc0f72 7948
8adc0f72 7949 eval_state->rex = rex;
a75351a1 7950
a75351a1
DM
7951 if (reginfo->sv) {
7952 /* Make $_ available to executed code. */
7953 if (reginfo->sv != DEFSV) {
7954 SAVE_DEFSV;
7955 DEFSV_set(reginfo->sv);
7956 }
7957
96c2a8ff 7958 if (!(mg = mg_find_mglob(reginfo->sv))) {
a75351a1 7959 /* prepare for quick setting of pos */
96c2a8ff 7960 mg = sv_magicext_mglob(reginfo->sv);
a75351a1
DM
7961 mg->mg_len = -1;
7962 }
8adc0f72
DM
7963 eval_state->pos_magic = mg;
7964 eval_state->pos = mg->mg_len;
25fdce4a 7965 eval_state->pos_flags = mg->mg_flags;
a75351a1 7966 }
8adc0f72
DM
7967 else
7968 eval_state->pos_magic = NULL;
7969
a75351a1 7970 if (!PL_reg_curpm) {
f65e70f5
DM
7971 /* PL_reg_curpm is a fake PMOP that we can attach the current
7972 * regex to and point PL_curpm at, so that $1 et al are visible
7973 * within a /(?{})/. It's just allocated once per interpreter the
7974 * first time its needed */
a75351a1
DM
7975 Newxz(PL_reg_curpm, 1, PMOP);
7976#ifdef USE_ITHREADS
7977 {
7978 SV* const repointer = &PL_sv_undef;
7979 /* this regexp is also owned by the new PL_reg_curpm, which
7980 will try to free it. */
7981 av_push(PL_regex_padav, repointer);
b9f2b683 7982 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
a75351a1
DM
7983 PL_regex_pad = AvARRAY(PL_regex_padav);
7984 }
7985#endif
7986 }
7987 SET_reg_curpm(reginfo->prog);
8adc0f72 7988 eval_state->curpm = PL_curpm;
a75351a1
DM
7989 PL_curpm = PL_reg_curpm;
7990 if (RXp_MATCH_COPIED(rex)) {
7991 /* Here is a serious problem: we cannot rewrite subbeg,
7992 since it may be needed if this match fails. Thus
7993 $` inside (?{}) could fail... */
8adc0f72
DM
7994 eval_state->subbeg = rex->subbeg;
7995 eval_state->sublen = rex->sublen;
7996 eval_state->suboffset = rex->suboffset;
a8ee055f 7997 eval_state->subcoffset = rex->subcoffset;
a75351a1 7998#ifdef PERL_ANY_COW
8adc0f72 7999 eval_state->saved_copy = rex->saved_copy;
a75351a1
DM
8000#endif
8001 RXp_MATCH_COPIED_off(rex);
8002 }
8003 else
8adc0f72 8004 eval_state->subbeg = NULL;
a75351a1
DM
8005 rex->subbeg = (char *)reginfo->strbeg;
8006 rex->suboffset = 0;
8007 rex->subcoffset = 0;
8008 rex->sublen = reginfo->strend - reginfo->strbeg;
8009}
8010
bf2039a9
DM
8011
8012/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
a75351a1 8013
51371543 8014static void
bf2039a9 8015S_cleanup_regmatch_info_aux(pTHX_ void *arg)
51371543 8016{
97aff369 8017 dVAR;
bf2039a9
DM
8018 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8019 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
331b2dcc 8020 regmatch_slab *s;
bf2039a9 8021
2ac8ff4b
DM
8022 Safefree(aux->poscache);
8023
331b2dcc 8024 if (eval_state) {
bf2039a9 8025
331b2dcc 8026 /* undo the effects of S_setup_eval_state() */
bf2039a9 8027
331b2dcc
DM
8028 if (eval_state->subbeg) {
8029 regexp * const rex = eval_state->rex;
8030 rex->subbeg = eval_state->subbeg;
8031 rex->sublen = eval_state->sublen;
8032 rex->suboffset = eval_state->suboffset;
8033 rex->subcoffset = eval_state->subcoffset;
db2c6cb3 8034#ifdef PERL_ANY_COW
331b2dcc 8035 rex->saved_copy = eval_state->saved_copy;
ed252734 8036#endif
331b2dcc
DM
8037 RXp_MATCH_COPIED_on(rex);
8038 }
8039 if (eval_state->pos_magic)
25fdce4a 8040 {
331b2dcc 8041 eval_state->pos_magic->mg_len = eval_state->pos;
25fdce4a
FC
8042 eval_state->pos_magic->mg_flags =
8043 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8044 | (eval_state->pos_flags & MGf_BYTES);
8045 }
331b2dcc
DM
8046
8047 PL_curpm = eval_state->curpm;
8adc0f72 8048 }
bf2039a9 8049
331b2dcc
DM
8050 PL_regmatch_state = aux->old_regmatch_state;
8051 PL_regmatch_slab = aux->old_regmatch_slab;
8052
8053 /* free all slabs above current one - this must be the last action
8054 * of this function, as aux and eval_state are allocated within
8055 * slabs and may be freed here */
8056
8057 s = PL_regmatch_slab->next;
8058 if (s) {
8059 PL_regmatch_slab->next = NULL;
8060 while (s) {
8061 regmatch_slab * const osl = s;
8062 s = s->next;
8063 Safefree(osl);
8064 }
8065 }
51371543 8066}
33b8afdf 8067
8adc0f72 8068
33b8afdf 8069STATIC void
5aaab254 8070S_to_utf8_substr(pTHX_ regexp *prog)
33b8afdf 8071{
7e0d5ad7
KW
8072 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8073 * on the converted value */
8074
a1cac82e 8075 int i = 1;
7918f24d
NC
8076
8077 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8078
a1cac82e
NC
8079 do {
8080 if (prog->substrs->data[i].substr
8081 && !prog->substrs->data[i].utf8_substr) {
8082 SV* const sv = newSVsv(prog->substrs->data[i].substr);
8083 prog->substrs->data[i].utf8_substr = sv;
8084 sv_utf8_upgrade(sv);
610460f9 8085 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 8086 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
8087 /* Trim the trailing \n that fbm_compile added last
8088 time. */
8089 SvCUR_set(sv, SvCUR(sv) - 1);
8090 /* Whilst this makes the SV technically "invalid" (as its
8091 buffer is no longer followed by "\0") when fbm_compile()
8092 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
8093 fbm_compile(sv, FBMcf_TAIL);
8094 } else
8095 fbm_compile(sv, 0);
610460f9 8096 }
a1cac82e
NC
8097 if (prog->substrs->data[i].substr == prog->check_substr)
8098 prog->check_utf8 = sv;
8099 }
8100 } while (i--);
33b8afdf
JH
8101}
8102
7e0d5ad7 8103STATIC bool
5aaab254 8104S_to_byte_substr(pTHX_ regexp *prog)
33b8afdf 8105{
7e0d5ad7
KW
8106 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8107 * on the converted value; returns FALSE if can't be converted. */
8108
97aff369 8109 dVAR;
a1cac82e 8110 int i = 1;
7918f24d
NC
8111
8112 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8113
a1cac82e
NC
8114 do {
8115 if (prog->substrs->data[i].utf8_substr
8116 && !prog->substrs->data[i].substr) {
8117 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
8118 if (! sv_utf8_downgrade(sv, TRUE)) {
8119 return FALSE;
8120 }
5400f398
KW
8121 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8122 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8123 /* Trim the trailing \n that fbm_compile added last
8124 time. */
8125 SvCUR_set(sv, SvCUR(sv) - 1);
8126 fbm_compile(sv, FBMcf_TAIL);
8127 } else
8128 fbm_compile(sv, 0);
8129 }
a1cac82e
NC
8130 prog->substrs->data[i].substr = sv;
8131 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
8132 prog->check_substr = sv;
33b8afdf 8133 }
a1cac82e 8134 } while (i--);
7e0d5ad7
KW
8135
8136 return TRUE;
33b8afdf 8137}
66610fdd
RGS
8138
8139/*
8140 * Local variables:
8141 * c-indentation-style: bsd
8142 * c-basic-offset: 4
14d04a33 8143 * indent-tabs-mode: nil
66610fdd
RGS
8144 * End:
8145 *
14d04a33 8146 * ex: set ts=8 sts=4 sw=4 et:
37442d52 8147 */