This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
eliminate RF_warned flag from PL_reg_flags
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
ef07e810 96#define RF_tainted 1 /* tainted information used? e.g. locale */
faec1544 97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
e0193e47 104/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
635cd5d4 107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 115#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad
YO
117#define HOPc(pos,off) \
118 (char *)(PL_reg_match_utf8 \
52657f30 119 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
120 : (U8*)(pos + off))
121#define HOPBACKc(pos, off) \
07be1b83
YO
122 (char*)(PL_reg_match_utf8\
123 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
124 : (pos - off >= PL_bostr) \
8e11feef 125 ? (U8*)pos - off \
3dab1dad 126 : NULL)
efb30f32 127
e7409c1b 128#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 129#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 130
7016d6eb
DM
131
132#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
133#define NEXTCHR_IS_EOS (nextchr < 0)
134
135#define SET_nextchr \
136 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
137
138#define SET_locinput(p) \
139 locinput = (p); \
140 SET_nextchr
141
142
c7304fe2
KW
143#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
144 if (!swash_ptr) { \
145 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
146 ENTER; save_re_context(); \
147 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
148 1, 0, NULL, &flags); \
149 assert(swash_ptr); \
150 } \
151 } STMT_END
152
153/* If in debug mode, we test that a known character properly matches */
154#ifdef DEBUGGING
155# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
156 property_name, \
157 utf8_char_in_property) \
158 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
159 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
160#else
161# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
162 property_name, \
163 utf8_char_in_property) \
164 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
165#endif
d1eb3177 166
c7304fe2
KW
167#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
168 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
169 swash_property_names[_CC_WORDCHAR], \
170 GREEK_SMALL_LETTER_IOTA_UTF8)
171
172#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
173 STMT_START { \
174 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
175 "_X_regular_begin", \
176 GREEK_SMALL_LETTER_IOTA_UTF8); \
177 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
178 "_X_extend", \
179 COMBINING_GRAVE_ACCENT_UTF8); \
180 } STMT_END
d1eb3177 181
c7304fe2 182#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
183/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
184
5f80c4cf 185/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
186/* it would be nice to rework regcomp.sym to generate this stuff. sigh
187 *
188 * NOTE that *nothing* that affects backtracking should be in here, specifically
189 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
190 * node that is in between two EXACT like nodes when ascertaining what the required
191 * "follow" character is. This should probably be moved to regex compile time
192 * although it may be done at run time beause of the REF possibility - more
193 * investigation required. -- demerphq
194*/
3e901dc0
YO
195#define JUMPABLE(rn) ( \
196 OP(rn) == OPEN || \
197 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
198 OP(rn) == EVAL || \
cca55fe3
JP
199 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
200 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 201 OP(rn) == KEEPS || \
3dab1dad 202 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 203)
ee9b8eae 204#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 205
ee9b8eae
YO
206#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
207
208#if 0
209/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
210 we don't need this definition. */
211#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 212#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
213#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
214
215#else
216/* ... so we use this as its faster. */
217#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 218#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
219#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
220#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
221
222#endif
e2d8ce26 223
a84d97b6
HS
224/*
225 Search for mandatory following text node; for lookahead, the text must
226 follow but for lookbehind (rn->flags != 0) we skip to the next step.
227*/
cca55fe3 228#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
229 while (JUMPABLE(rn)) { \
230 const OPCODE type = OP(rn); \
231 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 232 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 233 else if (type == PLUS) \
cca55fe3 234 rn = NEXTOPER(rn); \
3dab1dad 235 else if (type == IFMATCH) \
a84d97b6 236 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 237 else rn += NEXT_OFF(rn); \
3dab1dad 238 } \
5f80c4cf 239} STMT_END
74750237 240
22913b96
KW
241/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
242 * These are for the pre-composed Hangul syllables, which are all in a
243 * contiguous block and arranged there in such a way so as to facilitate
244 * alorithmic determination of their characteristics. As such, they don't need
245 * a swash, but can be determined by simple arithmetic. Almost all are
246 * GCB=LVT, but every 28th one is a GCB=LV */
247#define SBASE 0xAC00 /* Start of block */
248#define SCount 11172 /* Length of block */
249#define TCount 28
c476f425 250
acfe0abc 251static void restore_pos(pTHX_ void *arg);
51371543 252
87c0511b 253#define REGCP_PAREN_ELEMS 3
f067efbf 254#define REGCP_OTHER_ELEMS 3
e0fa7e2b 255#define REGCP_FRAME_ELEMS 1
620d5b66
NC
256/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
257 * are needed for the regexp context stack bookkeeping. */
258
76e3520e 259STATIC CHECKPOINT
92da3157 260S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 261{
97aff369 262 dVAR;
a3b680e6 263 const int retval = PL_savestack_ix;
92da3157
DM
264 const int paren_elems_to_push =
265 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
266 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
267 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 268 I32 p;
40a82448 269 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 270
b93070ed
DM
271 PERL_ARGS_ASSERT_REGCPPUSH;
272
e49a9654 273 if (paren_elems_to_push < 0)
5637ef5b
NC
274 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
275 paren_elems_to_push);
e49a9654 276
e0fa7e2b
NC
277 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
278 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 279 " out of range (%lu-%ld)",
92da3157
DM
280 total_elems,
281 (unsigned long)maxopenparen,
282 (long)parenfloor);
e0fa7e2b 283
620d5b66 284 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 285
495f47a5 286 DEBUG_BUFFERS_r(
92da3157 287 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
288 PerlIO_printf(Perl_debug_log,
289 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
290 PTR2UV(rex),
291 PTR2UV(rex->offs)
292 );
293 );
92da3157 294 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 295/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
296 SSPUSHINT(rex->offs[p].end);
297 SSPUSHINT(rex->offs[p].start);
1ca2007e 298 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 299 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
300 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
301 (UV)p,
302 (IV)rex->offs[p].start,
303 (IV)rex->offs[p].start_tmp,
304 (IV)rex->offs[p].end
40a82448 305 ));
a0d0e21e 306 }
b1ce53c5 307/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 308 SSPUSHINT(maxopenparen);
b93070ed
DM
309 SSPUSHINT(rex->lastparen);
310 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 311 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 312
a0d0e21e
LW
313 return retval;
314}
315
c277df42 316/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
317#define REGCP_SET(cp) \
318 DEBUG_STATE_r( \
ab3bbdeb 319 PerlIO_printf(Perl_debug_log, \
e4f74956 320 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
321 (IV)PL_savestack_ix)); \
322 cp = PL_savestack_ix
c3464db5 323
ab3bbdeb 324#define REGCP_UNWIND(cp) \
e4f74956 325 DEBUG_STATE_r( \
ab3bbdeb 326 if (cp != PL_savestack_ix) \
e4f74956
YO
327 PerlIO_printf(Perl_debug_log, \
328 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
329 (IV)(cp), (IV)PL_savestack_ix)); \
330 regcpblow(cp)
c277df42 331
a8d1f4b4
DM
332#define UNWIND_PAREN(lp, lcp) \
333 for (n = rex->lastparen; n > lp; n--) \
334 rex->offs[n].end = -1; \
335 rex->lastparen = n; \
336 rex->lastcloseparen = lcp;
337
338
f067efbf 339STATIC void
92da3157 340S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 341{
97aff369 342 dVAR;
e0fa7e2b 343 UV i;
87c0511b 344 U32 paren;
a3621e74
YO
345 GET_RE_DEBUG_FLAGS_DECL;
346
7918f24d
NC
347 PERL_ARGS_ASSERT_REGCPPOP;
348
b1ce53c5 349 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 350 i = SSPOPUV;
e0fa7e2b
NC
351 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
352 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
353 rex->lastcloseparen = SSPOPINT;
354 rex->lastparen = SSPOPINT;
92da3157 355 *maxopenparen_p = SSPOPINT;
b1ce53c5 356
620d5b66 357 i -= REGCP_OTHER_ELEMS;
b1ce53c5 358 /* Now restore the parentheses context. */
495f47a5
DM
359 DEBUG_BUFFERS_r(
360 if (i || rex->lastparen + 1 <= rex->nparens)
361 PerlIO_printf(Perl_debug_log,
362 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
363 PTR2UV(rex),
364 PTR2UV(rex->offs)
365 );
366 );
92da3157 367 paren = *maxopenparen_p;
620d5b66 368 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 369 I32 tmps;
1ca2007e 370 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 371 rex->offs[paren].start = SSPOPINT;
cf93c79d 372 tmps = SSPOPINT;
b93070ed
DM
373 if (paren <= rex->lastparen)
374 rex->offs[paren].end = tmps;
495f47a5
DM
375 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
376 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
377 (UV)paren,
378 (IV)rex->offs[paren].start,
379 (IV)rex->offs[paren].start_tmp,
380 (IV)rex->offs[paren].end,
381 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 382 );
87c0511b 383 paren--;
a0d0e21e 384 }
daf18116 385#if 1
dafc8851
JH
386 /* It would seem that the similar code in regtry()
387 * already takes care of this, and in fact it is in
388 * a better location to since this code can #if 0-ed out
389 * but the code in regtry() is needed or otherwise tests
390 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
391 * (as of patchlevel 7877) will fail. Then again,
392 * this code seems to be necessary or otherwise
225593e1
DM
393 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
394 * --jhi updated by dapm */
b93070ed 395 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 396 if (i > *maxopenparen_p)
b93070ed
DM
397 rex->offs[i].start = -1;
398 rex->offs[i].end = -1;
495f47a5
DM
399 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
400 " \\%"UVuf": %s ..-1 undeffing\n",
401 (UV)i,
92da3157 402 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 403 ));
a0d0e21e 404 }
dafc8851 405#endif
a0d0e21e
LW
406}
407
74088413
DM
408/* restore the parens and associated vars at savestack position ix,
409 * but without popping the stack */
410
411STATIC void
92da3157 412S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
413{
414 I32 tmpix = PL_savestack_ix;
415 PL_savestack_ix = ix;
92da3157 416 regcppop(rex, maxopenparen_p);
74088413
DM
417 PL_savestack_ix = tmpix;
418}
419
02db2b7b 420#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 421
31c7f561
KW
422STATIC bool
423S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
424{
425 /* Returns a boolean as to whether or not 'character' is a member of the
426 * Posix character class given by 'classnum' that should be equivalent to a
427 * value in the typedef '_char_class_number'.
428 *
429 * Ideally this could be replaced by a just an array of function pointers
430 * to the C library functions that implement the macros this calls.
431 * However, to compile, the precise function signatures are required, and
432 * these may vary from platform to to platform. To avoid having to figure
433 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
434 * which adds an extra layer of function call overhead (unless the C
435 * optimizer strips it away). But we don't particularly care about
436 * performance with locales anyway. */
31c7f561
KW
437
438 switch ((_char_class_number) classnum) {
15861f94 439 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561
KW
440 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
441 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
442 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
443 case _CC_ENUM_LOWER: return isLOWER_LC(character);
444 case _CC_ENUM_PRINT: return isPRINT_LC(character);
445 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
446 case _CC_ENUM_UPPER: return isUPPER_LC(character);
447 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
448 case _CC_ENUM_SPACE: return isSPACE_LC(character);
449 case _CC_ENUM_BLANK: return isBLANK_LC(character);
450 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
451 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
452 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
453 case _CC_ENUM_ASCII: return isASCII_LC(character);
454 default: /* VERTSPACE should never occur in locales */
455 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
456 }
457
458 assert(0); /* NOTREACHED */
459 return FALSE;
460}
461
3018b823
KW
462STATIC bool
463S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
464{
465 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
466 * 'character' is a member of the Posix character class given by 'classnum'
467 * that should be equivalent to a value in the typedef
468 * '_char_class_number'.
469 *
470 * This just calls isFOO_lc on the code point for the character if it is in
471 * the range 0-255. Outside that range, all characters avoid Unicode
472 * rules, ignoring any locale. So use the Unicode function if this class
473 * requires a swash, and use the Unicode macro otherwise. */
474
475 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
476
477 if (UTF8_IS_INVARIANT(*character)) {
478 return isFOO_lc(classnum, *character);
479 }
480 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
481 return isFOO_lc(classnum,
482 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
483 }
484
485 if (classnum < _FIRST_NON_SWASH_CC) {
486
487 /* Initialize the swash unless done already */
488 if (! PL_utf8_swash_ptrs[classnum]) {
489 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
490 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
491 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
492 }
493
494 return swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) character, TRUE);
495 }
496
497 switch ((_char_class_number) classnum) {
498 case _CC_ENUM_SPACE:
499 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
500
501 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
502 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
503 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
504 default: return 0; /* Things like CNTRL are always
505 below 256 */
506 }
507
508 assert(0); /* NOTREACHED */
509 return FALSE;
510}
511
a687059c 512/*
e50aee73 513 * pregexec and friends
a687059c
LW
514 */
515
76234dfb 516#ifndef PERL_IN_XSUB_RE
a687059c 517/*
c277df42 518 - pregexec - match a regexp against a string
a687059c 519 */
c277df42 520I32
5aaab254 521Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
c3464db5 522 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
523/* stringarg: the point in the string at which to begin matching */
524/* strend: pointer to null at end of string */
525/* strbeg: real beginning of string */
526/* minend: end of match must be >= minend bytes after stringarg. */
527/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
528 * itself is accessed via the pointers above */
529/* nosave: For optimizations. */
c277df42 530{
7918f24d
NC
531 PERL_ARGS_ASSERT_PREGEXEC;
532
c277df42 533 return
9041c2e3 534 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
535 nosave ? 0 : REXEC_COPY_STR);
536}
76234dfb 537#endif
22e551b9 538
9041c2e3 539/*
cad2e5aa
JH
540 * Need to implement the following flags for reg_anch:
541 *
542 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
543 * USE_INTUIT_ML
544 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
545 * INTUIT_AUTORITATIVE_ML
546 * INTUIT_ONCE_NOML - Intuit can match in one location only.
547 * INTUIT_ONCE_ML
548 *
549 * Another flag for this function: SECOND_TIME (so that float substrs
550 * with giant delta may be not rechecked).
551 */
552
553/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
554
3f7c398e 555/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
556 Otherwise, only SvCUR(sv) is used to get strbeg. */
557
558/* XXXX We assume that strpos is strbeg unless sv. */
559
6eb5f6b9
JH
560/* XXXX Some places assume that there is a fixed substring.
561 An update may be needed if optimizer marks as "INTUITable"
562 RExen without fixed substrings. Similarly, it is assumed that
563 lengths of all the strings are no more than minlen, thus they
564 cannot come from lookahead.
40d049e4
YO
565 (Or minlen should take into account lookahead.)
566 NOTE: Some of this comment is not correct. minlen does now take account
567 of lookahead/behind. Further research is required. -- demerphq
568
569*/
6eb5f6b9 570
2c2d71f5
JH
571/* A failure to find a constant substring means that there is no need to make
572 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 573 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
574 regtry() should be needed.
575
576 REx compiler's optimizer found 4 possible hints:
577 a) Anchored substring;
578 b) Fixed substring;
579 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 580 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 581 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
582 string which does not contradict any of them.
583 */
2c2d71f5 584
6eb5f6b9
JH
585/* Most of decisions we do here should have been done at compile time.
586 The nodes of the REx which we used for the search should have been
587 deleted from the finite automaton. */
588
cad2e5aa 589char *
288b8c02 590Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 591 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 592{
97aff369 593 dVAR;
8d919b0a 594 struct regexp *const prog = ReANY(rx);
eb578fdb 595 I32 start_shift = 0;
cad2e5aa 596 /* Should be nonnegative! */
eb578fdb
KW
597 I32 end_shift = 0;
598 char *s;
599 SV *check;
a1933d95 600 char *strbeg;
cad2e5aa 601 char *t;
f2ed9b32 602 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 603 I32 ml_anch;
eb578fdb 604 char *other_last = NULL; /* other substr checked before this */
bd61b366 605 char *check_at = NULL; /* check substr found at this pos */
d8080198 606 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 607 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 608 RXi_GET_DECL(prog,progi);
984e6dd1 609 bool is_utf8_pat;
30944b6d 610#ifdef DEBUGGING
890ce7af 611 const char * const i_strpos = strpos;
30944b6d 612#endif
a3621e74
YO
613 GET_RE_DEBUG_FLAGS_DECL;
614
7918f24d 615 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
616 PERL_UNUSED_ARG(flags);
617 PERL_UNUSED_ARG(data);
7918f24d 618
f2ed9b32 619 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 620
984e6dd1 621 is_utf8_pat = cBOOL(RX_UTF8(rx));
4fab19ce 622
ab3bbdeb 623 DEBUG_EXECUTE_r(
f2ed9b32 624 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
625 sv ? "Guessing start of match in sv for"
626 : "Guessing start of match in string for");
2a782b5b 627 );
cad2e5aa 628
c344f387
JH
629 /* CHR_DIST() would be more correct here but it makes things slow. */
630 if (prog->minlen > strend - strpos) {
a3621e74 631 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 632 "String too short... [re_intuit_start]\n"));
cad2e5aa 633 goto fail;
2c2d71f5 634 }
d8da0584 635
7016d6eb
DM
636 /* XXX we need to pass strbeg as a separate arg: the following is
637 * guesswork and can be wrong... */
638 if (sv && SvPOK(sv)) {
639 char * p = SvPVX(sv);
640 STRLEN cur = SvCUR(sv);
641 if (p <= strpos && strpos < p + cur) {
642 strbeg = p;
643 assert(p <= strend && strend <= p + cur);
644 }
645 else
646 strbeg = strend - cur;
647 }
648 else
649 strbeg = strpos;
650
1aa99e6b 651 PL_regeol = strend;
f2ed9b32 652 if (utf8_target) {
33b8afdf
JH
653 if (!prog->check_utf8 && prog->check_substr)
654 to_utf8_substr(prog);
655 check = prog->check_utf8;
656 } else {
7e0d5ad7
KW
657 if (!prog->check_substr && prog->check_utf8) {
658 if (! to_byte_substr(prog)) {
6b54ddc5 659 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
660 }
661 }
33b8afdf
JH
662 check = prog->check_substr;
663 }
bbe252da
YO
664 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
665 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
666 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 667 && !multiline ) ); /* Check after \n? */
cad2e5aa 668
7e25d62c 669 if (!ml_anch) {
bbe252da
YO
670 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
671 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 672 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
673 && sv && !SvROK(sv)
674 && (strpos != strbeg)) {
a3621e74 675 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
676 goto fail;
677 }
d46b78c6
KW
678 if (prog->check_offset_min == prog->check_offset_max
679 && !(prog->extflags & RXf_CANY_SEEN)
680 && ! multiline) /* /m can cause \n's to match that aren't
681 accounted for in the string max length.
682 See [perl #115242] */
683 {
2c2d71f5 684 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
685 I32 slen;
686
1aa99e6b 687 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 688
653099ff
GS
689 if (SvTAIL(check)) {
690 slen = SvCUR(check); /* >= 1 */
cad2e5aa 691
9041c2e3 692 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 693 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 694 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 695 goto fail_finish;
cad2e5aa
JH
696 }
697 /* Now should match s[0..slen-2] */
698 slen--;
3f7c398e 699 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 700 || (slen > 1
3f7c398e 701 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 702 report_neq:
a3621e74 703 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
704 goto fail_finish;
705 }
cad2e5aa 706 }
3f7c398e 707 else if (*SvPVX_const(check) != *s
653099ff 708 || ((slen = SvCUR(check)) > 1
3f7c398e 709 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 710 goto report_neq;
c315bfe8 711 check_at = s;
2c2d71f5 712 goto success_at_start;
7e25d62c 713 }
cad2e5aa 714 }
2c2d71f5 715 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 716 s = strpos;
2c2d71f5 717 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
718 end_shift = prog->check_end_shift;
719
2c2d71f5 720 if (!ml_anch) {
a3b680e6 721 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 722 - (SvTAIL(check) != 0);
a3b680e6 723 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
724
725 if (end_shift < eshift)
726 end_shift = eshift;
727 }
cad2e5aa 728 }
2c2d71f5 729 else { /* Can match at random position */
cad2e5aa
JH
730 ml_anch = 0;
731 s = strpos;
1de06328
YO
732 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
733 end_shift = prog->check_end_shift;
734
735 /* end shift should be non negative here */
cad2e5aa
JH
736 }
737
bcdf7404 738#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 739 if (end_shift < 0)
1de06328 740 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 741 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
742#endif
743
2c2d71f5
JH
744 restart:
745 /* Find a possible match in the region s..strend by looking for
746 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
747
748 {
749 I32 srch_start_shift = start_shift;
750 I32 srch_end_shift = end_shift;
c33e64f0
FC
751 U8* start_point;
752 U8* end_point;
1de06328
YO
753 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
754 srch_end_shift -= ((strbeg - s) - srch_start_shift);
755 srch_start_shift = strbeg - s;
756 }
6bda09f9 757 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
758 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
759 (IV)prog->check_offset_min,
760 (IV)srch_start_shift,
761 (IV)srch_end_shift,
762 (IV)prog->check_end_shift);
763 });
764
bbe252da 765 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
766 start_point= (U8*)(s + srch_start_shift);
767 end_point= (U8*)(strend - srch_end_shift);
768 } else {
769 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
770 end_point= HOP3(strend, -srch_end_shift, strbeg);
771 }
6bda09f9 772 DEBUG_OPTIMISE_MORE_r({
56570a2c 773 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 774 (int)(end_point - start_point),
fc8cd66c 775 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
776 start_point);
777 });
778
779 s = fbm_instr( start_point, end_point,
7fba1cd6 780 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 781 }
cad2e5aa
JH
782 /* Update the count-of-usability, remove useless subpatterns,
783 unshift s. */
2c2d71f5 784
ab3bbdeb 785 DEBUG_EXECUTE_r({
f2ed9b32 786 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
787 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
788 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 789 (s ? "Found" : "Did not find"),
f2ed9b32 790 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
791 ? "anchored" : "floating"),
792 quoted,
793 RE_SV_TAIL(check),
794 (s ? " at offset " : "...\n") );
795 });
2c2d71f5
JH
796
797 if (!s)
798 goto fail_finish;
2c2d71f5 799 /* Finish the diagnostic message */
a3621e74 800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 801
1de06328
YO
802 /* XXX dmq: first branch is for positive lookbehind...
803 Our check string is offset from the beginning of the pattern.
804 So we need to do any stclass tests offset forward from that
805 point. I think. :-(
806 */
807
808
809
810 check_at=s;
811
812
2c2d71f5
JH
813 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
814 Start with the other substr.
815 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 816 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
817 *always* match. Probably should be marked during compile...
818 Probably it is right to do no SCREAM here...
819 */
820
f2ed9b32 821 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
822 : (prog->float_substr && prog->anchored_substr))
823 {
30944b6d 824 /* Take into account the "other" substring. */
2c2d71f5
JH
825 /* XXXX May be hopelessly wrong for UTF... */
826 if (!other_last)
6eb5f6b9 827 other_last = strpos;
f2ed9b32 828 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
829 do_other_anchored:
830 {
890ce7af
AL
831 char * const last = HOP3c(s, -start_shift, strbeg);
832 char *last1, *last2;
be8e71aa 833 char * const saved_s = s;
33b8afdf 834 SV* must;
2c2d71f5 835
2c2d71f5
JH
836 t = s - prog->check_offset_max;
837 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 838 && (!utf8_target
0ce71af7 839 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 840 && t > strpos)))
6f207bd3 841 NOOP;
2c2d71f5
JH
842 else
843 t = strpos;
1aa99e6b 844 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
845 if (t < other_last) /* These positions already checked */
846 t = other_last;
1aa99e6b 847 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
848 if (last < last1)
849 last1 = last;
1de06328
YO
850 /* XXXX It is not documented what units *_offsets are in.
851 We assume bytes, but this is clearly wrong.
852 Meaning this code needs to be carefully reviewed for errors.
853 dmq.
854 */
855
2c2d71f5 856 /* On end-of-str: see comment below. */
f2ed9b32 857 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
858 if (must == &PL_sv_undef) {
859 s = (char*)NULL;
1de06328 860 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
861 }
862 else
863 s = fbm_instr(
864 (unsigned char*)t,
865 HOP3(HOP3(last1, prog->anchored_offset, strend)
866 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
867 must,
7fba1cd6 868 multiline ? FBMrf_MULTILINE : 0
33b8afdf 869 );
ab3bbdeb 870 DEBUG_EXECUTE_r({
f2ed9b32 871 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
872 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
873 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 874 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
875 quoted, RE_SV_TAIL(must));
876 });
877
878
2c2d71f5
JH
879 if (!s) {
880 if (last1 >= last2) {
a3621e74 881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
882 ", giving up...\n"));
883 goto fail_finish;
884 }
a3621e74 885 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 886 ", trying floating at offset %ld...\n",
be8e71aa 887 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
888 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
889 s = HOP3c(last, 1, strend);
2c2d71f5
JH
890 goto restart;
891 }
892 else {
a3621e74 893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 894 (long)(s - i_strpos)));
1aa99e6b
IH
895 t = HOP3c(s, -prog->anchored_offset, strbeg);
896 other_last = HOP3c(s, 1, strend);
be8e71aa 897 s = saved_s;
2c2d71f5
JH
898 if (t == strpos)
899 goto try_at_start;
2c2d71f5
JH
900 goto try_at_offset;
901 }
30944b6d 902 }
2c2d71f5
JH
903 }
904 else { /* Take into account the floating substring. */
33b8afdf 905 char *last, *last1;
be8e71aa 906 char * const saved_s = s;
33b8afdf
JH
907 SV* must;
908
909 t = HOP3c(s, -start_shift, strbeg);
910 last1 = last =
911 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
912 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
913 last = HOP3c(t, prog->float_max_offset, strend);
914 s = HOP3c(t, prog->float_min_offset, strend);
915 if (s < other_last)
916 s = other_last;
2c2d71f5 917 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 918 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
919 /* fbm_instr() takes into account exact value of end-of-str
920 if the check is SvTAIL(ed). Since false positives are OK,
921 and end-of-str is not later than strend we are OK. */
922 if (must == &PL_sv_undef) {
923 s = (char*)NULL;
1de06328 924 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
925 }
926 else
2c2d71f5 927 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
928 (unsigned char*)last + SvCUR(must)
929 - (SvTAIL(must)!=0),
7fba1cd6 930 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 931 DEBUG_EXECUTE_r({
f2ed9b32 932 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
933 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
934 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 935 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
936 quoted, RE_SV_TAIL(must));
937 });
33b8afdf
JH
938 if (!s) {
939 if (last1 == last) {
a3621e74 940 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
941 ", giving up...\n"));
942 goto fail_finish;
2c2d71f5 943 }
a3621e74 944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 945 ", trying anchored starting at offset %ld...\n",
be8e71aa 946 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
947 other_last = last;
948 s = HOP3c(t, 1, strend);
949 goto restart;
950 }
951 else {
a3621e74 952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
953 (long)(s - i_strpos)));
954 other_last = s; /* Fix this later. --Hugo */
be8e71aa 955 s = saved_s;
33b8afdf
JH
956 if (t == strpos)
957 goto try_at_start;
958 goto try_at_offset;
959 }
2c2d71f5 960 }
cad2e5aa 961 }
2c2d71f5 962
1de06328 963
9ef43ace 964 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 965
6bda09f9 966 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
967 PerlIO_printf(Perl_debug_log,
968 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
969 (IV)prog->check_offset_min,
970 (IV)prog->check_offset_max,
971 (IV)(s-strpos),
972 (IV)(t-strpos),
973 (IV)(t-s),
974 (IV)(strend-strpos)
975 )
976 );
977
2c2d71f5 978 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 979 && (!utf8_target
9ef43ace 980 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
981 && t > strpos)))
982 {
2c2d71f5
JH
983 /* Fixed substring is found far enough so that the match
984 cannot start at strpos. */
985 try_at_offset:
cad2e5aa 986 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
987 /* Eventually fbm_*() should handle this, but often
988 anchored_offset is not 0, so this check will not be wasted. */
989 /* XXXX In the code below we prefer to look for "^" even in
990 presence of anchored substrings. And we search even
991 beyond the found float position. These pessimizations
992 are historical artefacts only. */
993 find_anchor:
2c2d71f5 994 while (t < strend - prog->minlen) {
cad2e5aa 995 if (*t == '\n') {
4ee3650e 996 if (t < check_at - prog->check_offset_min) {
f2ed9b32 997 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
998 /* Since we moved from the found position,
999 we definitely contradict the found anchored
30944b6d
IZ
1000 substr. Due to the above check we do not
1001 contradict "check" substr.
1002 Thus we can arrive here only if check substr
1003 is float. Redo checking for "other"=="fixed".
1004 */
9041c2e3 1005 strpos = t + 1;
a3621e74 1006 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 1007 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
1008 goto do_other_anchored;
1009 }
4ee3650e
GS
1010 /* We don't contradict the found floating substring. */
1011 /* XXXX Why not check for STCLASS? */
cad2e5aa 1012 s = t + 1;
a3621e74 1013 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1014 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1015 goto set_useful;
1016 }
4ee3650e
GS
1017 /* Position contradicts check-string */
1018 /* XXXX probably better to look for check-string
1019 than for "\n", so one should lower the limit for t? */
a3621e74 1020 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1021 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1022 other_last = strpos = s = t + 1;
cad2e5aa
JH
1023 goto restart;
1024 }
1025 t++;
1026 }
a3621e74 1027 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1028 PL_colors[0], PL_colors[1]));
2c2d71f5 1029 goto fail_finish;
cad2e5aa 1030 }
f5952150 1031 else {
a3621e74 1032 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1033 PL_colors[0], PL_colors[1]));
f5952150 1034 }
cad2e5aa
JH
1035 s = t;
1036 set_useful:
f2ed9b32 1037 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1038 }
1039 else {
f5952150 1040 /* The found string does not prohibit matching at strpos,
2c2d71f5 1041 - no optimization of calling REx engine can be performed,
f5952150
GS
1042 unless it was an MBOL and we are not after MBOL,
1043 or a future STCLASS check will fail this. */
2c2d71f5
JH
1044 try_at_start:
1045 /* Even in this situation we may use MBOL flag if strpos is offset
1046 wrt the start of the string. */
05b4157f 1047 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1048 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1049 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1050 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1051 {
cad2e5aa
JH
1052 t = strpos;
1053 goto find_anchor;
1054 }
a3621e74 1055 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1056 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1057 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1058 );
2c2d71f5 1059 success_at_start:
bbe252da 1060 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1061 && (utf8_target ? (
33b8afdf
JH
1062 prog->check_utf8 /* Could be deleted already */
1063 && --BmUSEFUL(prog->check_utf8) < 0
1064 && (prog->check_utf8 == prog->float_utf8)
1065 ) : (
1066 prog->check_substr /* Could be deleted already */
1067 && --BmUSEFUL(prog->check_substr) < 0
1068 && (prog->check_substr == prog->float_substr)
1069 )))
66e933ab 1070 {
cad2e5aa 1071 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1072 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1073 /* XXX Does the destruction order has to change with utf8_target? */
1074 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1075 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1076 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1077 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1078 check = NULL; /* abort */
cad2e5aa 1079 s = strpos;
486ec47a 1080 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1081 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1082 if (prog->intflags & PREGf_IMPLICIT)
1083 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1084 /* XXXX This is a remnant of the old implementation. It
1085 looks wasteful, since now INTUIT can use many
6eb5f6b9 1086 other heuristics. */
bbe252da 1087 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1088 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1089 }
1090 else
1091 s = strpos;
1092 }
1093
6eb5f6b9
JH
1094 /* Last resort... */
1095 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1096 /* trie stclasses are too expensive to use here, we are better off to
1097 leave it to regmatch itself */
f8fc2ecf 1098 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1099 /* minlen == 0 is possible if regstclass is \b or \B,
1100 and the fixed substr is ''$.
1101 Since minlen is already taken into account, s+1 is before strend;
1102 accidentally, minlen >= 1 guaranties no false positives at s + 1
1103 even for \b or \B. But (minlen? 1 : 0) below assumes that
1104 regstclass does not come from lookahead... */
1105 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1106 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1107 const U8* const str = (U8*)STRING(progi->regstclass);
1108 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1109 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1110 : 1);
1de06328
YO
1111 char * endpos;
1112 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1113 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1114 else if (prog->float_substr || prog->float_utf8)
1115 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1116 else
1117 endpos= strend;
1118
d8080198
YO
1119 if (checked_upto < s)
1120 checked_upto = s;
1121 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1122 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1123
6eb5f6b9 1124 t = s;
984e6dd1
DM
1125 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1126 NULL, is_utf8_pat);
d8080198
YO
1127 if (s) {
1128 checked_upto = s;
1129 } else {
6eb5f6b9 1130#ifdef DEBUGGING
cbbf8932 1131 const char *what = NULL;
6eb5f6b9
JH
1132#endif
1133 if (endpos == strend) {
a3621e74 1134 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1135 "Could not match STCLASS...\n") );
1136 goto fail;
1137 }
a3621e74 1138 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1139 "This position contradicts STCLASS...\n") );
bbe252da 1140 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1141 goto fail;
d8080198
YO
1142 checked_upto = HOPBACKc(endpos, start_shift);
1143 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1144 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1145 /* Contradict one of substrings */
33b8afdf 1146 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1147 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1148 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1149 hop_and_restart:
1aa99e6b 1150 s = HOP3c(t, 1, strend);
66e933ab
GS
1151 if (s + start_shift + end_shift > strend) {
1152 /* XXXX Should be taken into account earlier? */
a3621e74 1153 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1154 "Could not match STCLASS...\n") );
1155 goto fail;
1156 }
5e39e1e5
HS
1157 if (!check)
1158 goto giveup;
a3621e74 1159 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1160 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1161 what, (long)(s + start_shift - i_strpos)) );
1162 goto restart;
1163 }
66e933ab 1164 /* Have both, check_string is floating */
6eb5f6b9
JH
1165 if (t + start_shift >= check_at) /* Contradicts floating=check */
1166 goto retry_floating_check;
1167 /* Recheck anchored substring, but not floating... */
9041c2e3 1168 s = check_at;
5e39e1e5
HS
1169 if (!check)
1170 goto giveup;
a3621e74 1171 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1172 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1173 (long)(other_last - i_strpos)) );
1174 goto do_other_anchored;
1175 }
60e71179
GS
1176 /* Another way we could have checked stclass at the
1177 current position only: */
1178 if (ml_anch) {
1179 s = t = t + 1;
5e39e1e5
HS
1180 if (!check)
1181 goto giveup;
a3621e74 1182 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1183 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1184 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1185 goto try_at_offset;
66e933ab 1186 }
f2ed9b32 1187 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1188 goto fail;
486ec47a 1189 /* Check is floating substring. */
6eb5f6b9
JH
1190 retry_floating_check:
1191 t = check_at - start_shift;
a3621e74 1192 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1193 goto hop_and_restart;
1194 }
b7953727 1195 if (t != s) {
a3621e74 1196 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1197 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1198 (long)(t - i_strpos), (long)(s - i_strpos))
1199 );
1200 }
1201 else {
a3621e74 1202 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1203 "Does not contradict STCLASS...\n");
1204 );
1205 }
6eb5f6b9 1206 }
5e39e1e5 1207 giveup:
a3621e74 1208 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1209 PL_colors[4], (check ? "Guessed" : "Giving up"),
1210 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1211 return s;
2c2d71f5
JH
1212
1213 fail_finish: /* Substring not found */
33b8afdf 1214 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1215 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1216 fail:
a3621e74 1217 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1218 PL_colors[4], PL_colors[5]));
bd61b366 1219 return NULL;
cad2e5aa 1220}
9661b544 1221
a0a388a1
YO
1222#define DECL_TRIE_TYPE(scan) \
1223 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1224 trie_type = ((scan->flags == EXACT) \
1225 ? (utf8_target ? trie_utf8 : trie_plain) \
1226 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1227
1228#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1229uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1230 STRLEN skiplen; \
1231 switch (trie_type) { \
1232 case trie_utf8_fold: \
1233 if ( foldlen>0 ) { \
1234 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1235 foldlen -= len; \
1236 uscan += len; \
1237 len=0; \
1238 } else { \
1239 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1240 len = UTF8SKIP(uc); \
1241 skiplen = UNISKIP( uvc ); \
1242 foldlen -= skiplen; \
1243 uscan = foldbuf + skiplen; \
1244 } \
1245 break; \
1246 case trie_latin_utf8_fold: \
1247 if ( foldlen>0 ) { \
1248 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1249 foldlen -= len; \
1250 uscan += len; \
1251 len=0; \
1252 } else { \
1253 len = 1; \
1254 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1255 skiplen = UNISKIP( uvc ); \
1256 foldlen -= skiplen; \
1257 uscan = foldbuf + skiplen; \
1258 } \
1259 break; \
1260 case trie_utf8: \
1261 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1262 break; \
1263 case trie_plain: \
1264 uvc = (UV)*uc; \
1265 len = 1; \
1266 } \
1267 if (uvc < 256) { \
1268 charid = trie->charmap[ uvc ]; \
1269 } \
1270 else { \
1271 charid = 0; \
1272 if (widecharmap) { \
1273 SV** const svpp = hv_fetch(widecharmap, \
1274 (char*)&uvc, sizeof(UV), 0); \
1275 if (svpp) \
1276 charid = (U16)SvIV(*svpp); \
1277 } \
1278 } \
4cadc6a9
YO
1279} STMT_END
1280
4cadc6a9
YO
1281#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1282STMT_START { \
1283 while (s <= e) { \
1284 if ( (CoNd) \
fac1af77 1285 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1286 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1287 goto got_it; \
1288 s++; \
1289 } \
1290} STMT_END
1291
1292#define REXEC_FBC_UTF8_SCAN(CoDe) \
1293STMT_START { \
9a902117 1294 while (s < strend) { \
4cadc6a9 1295 CoDe \
9a902117 1296 s += UTF8SKIP(s); \
4cadc6a9
YO
1297 } \
1298} STMT_END
1299
1300#define REXEC_FBC_SCAN(CoDe) \
1301STMT_START { \
1302 while (s < strend) { \
1303 CoDe \
1304 s++; \
1305 } \
1306} STMT_END
1307
1308#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1309REXEC_FBC_UTF8_SCAN( \
1310 if (CoNd) { \
7aee35ff 1311 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1312 goto got_it; \
1313 else \
1314 tmp = doevery; \
1315 } \
1316 else \
1317 tmp = 1; \
1318)
1319
1320#define REXEC_FBC_CLASS_SCAN(CoNd) \
1321REXEC_FBC_SCAN( \
1322 if (CoNd) { \
24b23f37 1323 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1324 goto got_it; \
1325 else \
1326 tmp = doevery; \
1327 } \
1328 else \
1329 tmp = 1; \
1330)
1331
1332#define REXEC_FBC_TRYIT \
24b23f37 1333if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1334 goto got_it
1335
e1d1eefb 1336#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1337 if (utf8_target) { \
e1d1eefb
YO
1338 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1339 } \
1340 else { \
1341 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1342 }
e1d1eefb 1343
786e8c11
YO
1344#define DUMP_EXEC_POS(li,s,doutf8) \
1345 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1346
cfaf538b
KW
1347
1348#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1349 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1350 tmp = TEST_NON_UTF8(tmp); \
1351 REXEC_FBC_UTF8_SCAN( \
1352 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1353 tmp = !tmp; \
1354 IF_SUCCESS; \
1355 } \
1356 else { \
1357 IF_FAIL; \
1358 } \
1359 ); \
1360
1361#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1362 if (s == PL_bostr) { \
1363 tmp = '\n'; \
1364 } \
1365 else { \
1366 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1367 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1368 } \
1369 tmp = TeSt1_UtF8; \
1370 LOAD_UTF8_CHARCLASS_ALNUM(); \
1371 REXEC_FBC_UTF8_SCAN( \
1372 if (tmp == ! (TeSt2_UtF8)) { \
1373 tmp = !tmp; \
1374 IF_SUCCESS; \
1375 } \
1376 else { \
1377 IF_FAIL; \
1378 } \
1379 ); \
1380
63ac0dad
KW
1381/* The only difference between the BOUND and NBOUND cases is that
1382 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1383 * NBOUND. This is accomplished by passing it in either the if or else clause,
1384 * with the other one being empty */
1385#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1386 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1387
1388#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1389 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1390
1391#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1392 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1393
1394#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1395 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1396
63ac0dad
KW
1397
1398/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1399 * be passed in completely with the variable name being tested, which isn't
1400 * such a clean interface, but this is easier to read than it was before. We
1401 * are looking for the boundary (or non-boundary between a word and non-word
1402 * character. The utf8 and non-utf8 cases have the same logic, but the details
1403 * must be different. Find the "wordness" of the character just prior to this
1404 * one, and compare it with the wordness of this one. If they differ, we have
1405 * a boundary. At the beginning of the string, pretend that the previous
1406 * character was a new-line */
cfaf538b 1407#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1408 if (utf8_target) { \
cfaf538b 1409 UTF8_CODE \
63ac0dad
KW
1410 } \
1411 else { /* Not utf8 */ \
1412 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1413 tmp = TEST_NON_UTF8(tmp); \
1414 REXEC_FBC_SCAN( \
1415 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1416 tmp = !tmp; \
1417 IF_SUCCESS; \
1418 } \
1419 else { \
1420 IF_FAIL; \
1421 } \
1422 ); \
1423 } \
1424 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1425 goto got_it;
1426
786e8c11
YO
1427/* We know what class REx starts with. Try to find this position... */
1428/* if reginfo is NULL, its a dryrun */
1429/* annoyingly all the vars in this routine have different names from their counterparts
1430 in regmatch. /grrr */
1431
3c3eec57 1432STATIC char *
07be1b83 1433S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
984e6dd1 1434 const char *strend, regmatch_info *reginfo, bool is_utf8_pat)
a687059c 1435{
73104a1b
KW
1436 dVAR;
1437 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1438 char *pat_string; /* The pattern's exactish string */
1439 char *pat_end; /* ptr to end char of pat_string */
1440 re_fold_t folder; /* Function for computing non-utf8 folds */
1441 const U8 *fold_array; /* array for folding ords < 256 */
1442 STRLEN ln;
1443 STRLEN lnc;
73104a1b
KW
1444 U8 c1;
1445 U8 c2;
1446 char *e;
1447 I32 tmp = 1; /* Scratch variable? */
1448 const bool utf8_target = PL_reg_match_utf8;
1449 UV utf8_fold_flags = 0;
3018b823
KW
1450 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1451 with a result inverts that result, as 0^1 =
1452 1 and 1^1 = 0 */
1453 _char_class_number classnum;
1454
73104a1b 1455 RXi_GET_DECL(prog,progi);
2f7f8cb1 1456
73104a1b 1457 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1458
73104a1b
KW
1459 /* We know what class it must start with. */
1460 switch (OP(c)) {
1461 case ANYOF:
1462 if (utf8_target) {
1463 REXEC_FBC_UTF8_CLASS_SCAN(
1464 reginclass(prog, c, (U8*)s, utf8_target));
1465 }
1466 else {
1467 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1468 }
1469 break;
1470 case CANY:
1471 REXEC_FBC_SCAN(
1472 if (tmp && (!reginfo || regtry(reginfo, &s)))
1473 goto got_it;
1474 else
1475 tmp = doevery;
1476 );
1477 break;
1478
1479 case EXACTFA:
984e6dd1 1480 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1481 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1482 goto do_exactf_utf8;
1483 }
1484 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1485 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1486 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1487
73104a1b
KW
1488 case EXACTF:
1489 if (utf8_target) {
16d951b7 1490
73104a1b
KW
1491 /* regcomp.c already folded this if pattern is in UTF-8 */
1492 utf8_fold_flags = 0;
1493 goto do_exactf_utf8;
1494 }
1495 fold_array = PL_fold;
1496 folder = foldEQ;
1497 goto do_exactf_non_utf8;
1498
1499 case EXACTFL:
984e6dd1 1500 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1501 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1502 goto do_exactf_utf8;
1503 }
1504 fold_array = PL_fold_locale;
1505 folder = foldEQ_locale;
1506 goto do_exactf_non_utf8;
3c760661 1507
73104a1b 1508 case EXACTFU_SS:
984e6dd1 1509 if (is_utf8_pat) {
73104a1b
KW
1510 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1511 }
1512 goto do_exactf_utf8;
16d951b7 1513
73104a1b
KW
1514 case EXACTFU_TRICKYFOLD:
1515 case EXACTFU:
984e6dd1
DM
1516 if (is_utf8_pat || utf8_target) {
1517 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1518 goto do_exactf_utf8;
1519 }
fac1af77 1520
73104a1b
KW
1521 /* Any 'ss' in the pattern should have been replaced by regcomp,
1522 * so we don't have to worry here about this single special case
1523 * in the Latin1 range */
1524 fold_array = PL_fold_latin1;
1525 folder = foldEQ_latin1;
1526
1527 /* FALL THROUGH */
1528
1529 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1530 are no glitches with fold-length differences
1531 between the target string and pattern */
1532
1533 /* The idea in the non-utf8 EXACTF* cases is to first find the
1534 * first character of the EXACTF* node and then, if necessary,
1535 * case-insensitively compare the full text of the node. c1 is the
1536 * first character. c2 is its fold. This logic will not work for
1537 * Unicode semantics and the german sharp ss, which hence should
1538 * not be compiled into a node that gets here. */
1539 pat_string = STRING(c);
1540 ln = STR_LEN(c); /* length to match in octets/bytes */
1541
1542 /* We know that we have to match at least 'ln' bytes (which is the
1543 * same as characters, since not utf8). If we have to match 3
1544 * characters, and there are only 2 availabe, we know without
1545 * trying that it will fail; so don't start a match past the
1546 * required minimum number from the far end */
1547 e = HOP3c(strend, -((I32)ln), s);
1548
1549 if (!reginfo && e < s) {
1550 e = s; /* Due to minlen logic of intuit() */
1551 }
fac1af77 1552
73104a1b
KW
1553 c1 = *pat_string;
1554 c2 = fold_array[c1];
1555 if (c1 == c2) { /* If char and fold are the same */
1556 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1557 }
1558 else {
1559 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1560 }
1561 break;
fac1af77 1562
73104a1b
KW
1563 do_exactf_utf8:
1564 {
1565 unsigned expansion;
1566
1567 /* If one of the operands is in utf8, we can't use the simpler folding
1568 * above, due to the fact that many different characters can have the
1569 * same fold, or portion of a fold, or different- length fold */
1570 pat_string = STRING(c);
1571 ln = STR_LEN(c); /* length to match in octets/bytes */
1572 pat_end = pat_string + ln;
984e6dd1 1573 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1574 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1575 : ln;
1576
1577 /* We have 'lnc' characters to match in the pattern, but because of
1578 * multi-character folding, each character in the target can match
1579 * up to 3 characters (Unicode guarantees it will never exceed
1580 * this) if it is utf8-encoded; and up to 2 if not (based on the
1581 * fact that the Latin 1 folds are already determined, and the
1582 * only multi-char fold in that range is the sharp-s folding to
1583 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1584 * string character. Adjust lnc accordingly, rounding up, so that
1585 * if we need to match at least 4+1/3 chars, that really is 5. */
1586 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1587 lnc = (lnc + expansion - 1) / expansion;
1588
1589 /* As in the non-UTF8 case, if we have to match 3 characters, and
1590 * only 2 are left, it's guaranteed to fail, so don't start a
1591 * match that would require us to go beyond the end of the string
1592 */
1593 e = HOP3c(strend, -((I32)lnc), s);
1594
1595 if (!reginfo && e < s) {
1596 e = s; /* Due to minlen logic of intuit() */
1597 }
0658cdde 1598
73104a1b
KW
1599 /* XXX Note that we could recalculate e to stop the loop earlier,
1600 * as the worst case expansion above will rarely be met, and as we
1601 * go along we would usually find that e moves further to the left.
1602 * This would happen only after we reached the point in the loop
1603 * where if there were no expansion we should fail. Unclear if
1604 * worth the expense */
1605
1606 while (s <= e) {
1607 char *my_strend= (char *)strend;
1608 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1609 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
73104a1b
KW
1610 && (!reginfo || regtry(reginfo, &s)) )
1611 {
1612 goto got_it;
1613 }
1614 s += (utf8_target) ? UTF8SKIP(s) : 1;
1615 }
1616 break;
1617 }
1618 case BOUNDL:
1619 PL_reg_flags |= RF_tainted;
1620 FBC_BOUND(isALNUM_LC,
1621 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1622 isALNUM_LC_utf8((U8*)s));
1623 break;
1624 case NBOUNDL:
1625 PL_reg_flags |= RF_tainted;
1626 FBC_NBOUND(isALNUM_LC,
1627 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1628 isALNUM_LC_utf8((U8*)s));
1629 break;
1630 case BOUND:
1631 FBC_BOUND(isWORDCHAR,
1632 isALNUM_uni(tmp),
03940dc2 1633 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1634 break;
1635 case BOUNDA:
1636 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1637 isWORDCHAR_A(tmp),
1638 isWORDCHAR_A((U8*)s));
1639 break;
1640 case NBOUND:
1641 FBC_NBOUND(isWORDCHAR,
1642 isALNUM_uni(tmp),
03940dc2 1643 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1644 break;
1645 case NBOUNDA:
1646 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1647 isWORDCHAR_A(tmp),
1648 isWORDCHAR_A((U8*)s));
1649 break;
1650 case BOUNDU:
1651 FBC_BOUND(isWORDCHAR_L1,
1652 isALNUM_uni(tmp),
03940dc2 1653 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1654 break;
1655 case NBOUNDU:
1656 FBC_NBOUND(isWORDCHAR_L1,
1657 isALNUM_uni(tmp),
03940dc2 1658 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1659 break;
73104a1b
KW
1660 case LNBREAK:
1661 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1662 is_LNBREAK_latin1_safe(s, strend)
1663 );
1664 break;
3018b823
KW
1665
1666 /* The argument to all the POSIX node types is the class number to pass to
1667 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1668
1669 case NPOSIXL:
1670 to_complement = 1;
1671 /* FALLTHROUGH */
1672
1673 case POSIXL:
1674 PL_reg_flags |= RF_tainted;
1675 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1676 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1677 break;
3018b823
KW
1678
1679 case NPOSIXD:
1680 to_complement = 1;
1681 /* FALLTHROUGH */
1682
1683 case POSIXD:
1684 if (utf8_target) {
1685 goto posix_utf8;
1686 }
1687 goto posixa;
1688
1689 case NPOSIXA:
1690 if (utf8_target) {
1691 /* The complement of something that matches only ASCII matches all
1692 * UTF-8 variant code points, plus everything in ASCII that isn't
1693 * in the class */
1694 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1695 || ! _generic_isCC_A(*s, FLAGS(c)));
1696 break;
1697 }
1698
1699 to_complement = 1;
1700 /* FALLTHROUGH */
1701
73104a1b 1702 case POSIXA:
3018b823 1703 posixa:
73104a1b 1704 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1705 * byte invariant character. */
1706 REXEC_FBC_CLASS_SCAN(
1707 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1708 break;
3018b823
KW
1709
1710 case NPOSIXU:
1711 to_complement = 1;
1712 /* FALLTHROUGH */
1713
1714 case POSIXU:
1715 if (! utf8_target) {
1716 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1717 FLAGS(c))));
1718 }
1719 else {
1720
1721 posix_utf8:
1722 classnum = (_char_class_number) FLAGS(c);
1723 if (classnum < _FIRST_NON_SWASH_CC) {
1724 while (s < strend) {
1725
1726 /* We avoid loading in the swash as long as possible, but
1727 * should we have to, we jump to a separate loop. This
1728 * extra 'if' statement is what keeps this code from being
1729 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1730 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1731 goto found_above_latin1;
1732 }
1733 if ((UTF8_IS_INVARIANT(*s)
1734 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1735 classnum)))
1736 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1737 && to_complement ^ cBOOL(
1738 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1739 classnum))))
1740 {
1741 if (tmp && (!reginfo || regtry(reginfo, &s)))
1742 goto got_it;
1743 else {
1744 tmp = doevery;
1745 }
1746 }
1747 else {
1748 tmp = 1;
1749 }
1750 s += UTF8SKIP(s);
1751 }
1752 }
1753 else switch (classnum) { /* These classes are implemented as
1754 macros */
1755 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1756 revert the change of \v matching this */
1757 /* FALL THROUGH */
1758
1759 case _CC_ENUM_PSXSPC:
1760 REXEC_FBC_UTF8_CLASS_SCAN(
1761 to_complement ^ cBOOL(isSPACE_utf8(s)));
1762 break;
1763
1764 case _CC_ENUM_BLANK:
1765 REXEC_FBC_UTF8_CLASS_SCAN(
1766 to_complement ^ cBOOL(isBLANK_utf8(s)));
1767 break;
1768
1769 case _CC_ENUM_XDIGIT:
1770 REXEC_FBC_UTF8_CLASS_SCAN(
1771 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1772 break;
1773
1774 case _CC_ENUM_VERTSPACE:
1775 REXEC_FBC_UTF8_CLASS_SCAN(
1776 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1777 break;
1778
1779 case _CC_ENUM_CNTRL:
1780 REXEC_FBC_UTF8_CLASS_SCAN(
1781 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1782 break;
1783
1784 default:
1785 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1786 assert(0); /* NOTREACHED */
1787 }
1788 }
1789 break;
1790
1791 found_above_latin1: /* Here we have to load a swash to get the result
1792 for the current code point */
1793 if (! PL_utf8_swash_ptrs[classnum]) {
1794 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1795 PL_utf8_swash_ptrs[classnum] =
1796 _core_swash_init("utf8", swash_property_names[classnum],
1797 &PL_sv_undef, 1, 0, NULL, &flags);
1798 }
1799
1800 /* This is a copy of the loop above for swash classes, though using the
1801 * FBC macro instead of being expanded out. Since we've loaded the
1802 * swash, we don't have to check for that each time through the loop */
1803 REXEC_FBC_UTF8_CLASS_SCAN(
1804 to_complement ^ cBOOL(_generic_utf8(
1805 classnum,
1806 s,
1807 swash_fetch(PL_utf8_swash_ptrs[classnum],
1808 (U8 *) s, TRUE))));
73104a1b
KW
1809 break;
1810
1811 case AHOCORASICKC:
1812 case AHOCORASICK:
1813 {
1814 DECL_TRIE_TYPE(c);
1815 /* what trie are we using right now */
1816 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1817 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1818 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1819
1820 const char *last_start = strend - trie->minlen;
6148ee25 1821#ifdef DEBUGGING
73104a1b 1822 const char *real_start = s;
6148ee25 1823#endif
73104a1b
KW
1824 STRLEN maxlen = trie->maxlen;
1825 SV *sv_points;
1826 U8 **points; /* map of where we were in the input string
1827 when reading a given char. For ASCII this
1828 is unnecessary overhead as the relationship
1829 is always 1:1, but for Unicode, especially
1830 case folded Unicode this is not true. */
1831 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1832 U8 *bitmap=NULL;
1833
1834
1835 GET_RE_DEBUG_FLAGS_DECL;
1836
1837 /* We can't just allocate points here. We need to wrap it in
1838 * an SV so it gets freed properly if there is a croak while
1839 * running the match */
1840 ENTER;
1841 SAVETMPS;
1842 sv_points=newSV(maxlen * sizeof(U8 *));
1843 SvCUR_set(sv_points,
1844 maxlen * sizeof(U8 *));
1845 SvPOK_on(sv_points);
1846 sv_2mortal(sv_points);
1847 points=(U8**)SvPV_nolen(sv_points );
1848 if ( trie_type != trie_utf8_fold
1849 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1850 {
1851 if (trie->bitmap)
1852 bitmap=(U8*)trie->bitmap;
1853 else
1854 bitmap=(U8*)ANYOF_BITMAP(c);
1855 }
1856 /* this is the Aho-Corasick algorithm modified a touch
1857 to include special handling for long "unknown char" sequences.
1858 The basic idea being that we use AC as long as we are dealing
1859 with a possible matching char, when we encounter an unknown char
1860 (and we have not encountered an accepting state) we scan forward
1861 until we find a legal starting char.
1862 AC matching is basically that of trie matching, except that when
1863 we encounter a failing transition, we fall back to the current
1864 states "fail state", and try the current char again, a process
1865 we repeat until we reach the root state, state 1, or a legal
1866 transition. If we fail on the root state then we can either
1867 terminate if we have reached an accepting state previously, or
1868 restart the entire process from the beginning if we have not.
1869
1870 */
1871 while (s <= last_start) {
1872 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1873 U8 *uc = (U8*)s;
1874 U16 charid = 0;
1875 U32 base = 1;
1876 U32 state = 1;
1877 UV uvc = 0;
1878 STRLEN len = 0;
1879 STRLEN foldlen = 0;
1880 U8 *uscan = (U8*)NULL;
1881 U8 *leftmost = NULL;
1882#ifdef DEBUGGING
1883 U32 accepted_word= 0;
786e8c11 1884#endif
73104a1b
KW
1885 U32 pointpos = 0;
1886
1887 while ( state && uc <= (U8*)strend ) {
1888 int failed=0;
1889 U32 word = aho->states[ state ].wordnum;
1890
1891 if( state==1 ) {
1892 if ( bitmap ) {
1893 DEBUG_TRIE_EXECUTE_r(
1894 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1895 dump_exec_pos( (char *)uc, c, strend, real_start,
1896 (char *)uc, utf8_target );
1897 PerlIO_printf( Perl_debug_log,
1898 " Scanning for legal start char...\n");
1899 }
1900 );
1901 if (utf8_target) {
1902 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1903 uc += UTF8SKIP(uc);
1904 }
1905 } else {
1906 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1907 uc++;
1908 }
786e8c11 1909 }
73104a1b 1910 s= (char *)uc;
07be1b83 1911 }
73104a1b
KW
1912 if (uc >(U8*)last_start) break;
1913 }
1914
1915 if ( word ) {
1916 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1917 if (!leftmost || lpos < leftmost) {
1918 DEBUG_r(accepted_word=word);
1919 leftmost= lpos;
7016d6eb 1920 }
73104a1b 1921 if (base==0) break;
7016d6eb 1922
73104a1b
KW
1923 }
1924 points[pointpos++ % maxlen]= uc;
1925 if (foldlen || uc < (U8*)strend) {
1926 REXEC_TRIE_READ_CHAR(trie_type, trie,
1927 widecharmap, uc,
1928 uscan, len, uvc, charid, foldlen,
1929 foldbuf, uniflags);
1930 DEBUG_TRIE_EXECUTE_r({
1931 dump_exec_pos( (char *)uc, c, strend,
1932 real_start, s, utf8_target);
1933 PerlIO_printf(Perl_debug_log,
1934 " Charid:%3u CP:%4"UVxf" ",
1935 charid, uvc);
1936 });
1937 }
1938 else {
1939 len = 0;
1940 charid = 0;
1941 }
07be1b83 1942
73104a1b
KW
1943
1944 do {
6148ee25 1945#ifdef DEBUGGING
73104a1b 1946 word = aho->states[ state ].wordnum;
6148ee25 1947#endif
73104a1b
KW
1948 base = aho->states[ state ].trans.base;
1949
1950 DEBUG_TRIE_EXECUTE_r({
1951 if (failed)
1952 dump_exec_pos( (char *)uc, c, strend, real_start,
1953 s, utf8_target );
1954 PerlIO_printf( Perl_debug_log,
1955 "%sState: %4"UVxf", word=%"UVxf,
1956 failed ? " Fail transition to " : "",
1957 (UV)state, (UV)word);
1958 });
1959 if ( base ) {
1960 U32 tmp;
1961 I32 offset;
1962 if (charid &&
1963 ( ((offset = base + charid
1964 - 1 - trie->uniquecharcount)) >= 0)
1965 && ((U32)offset < trie->lasttrans)
1966 && trie->trans[offset].check == state
1967 && (tmp=trie->trans[offset].next))
1968 {
1969 DEBUG_TRIE_EXECUTE_r(
1970 PerlIO_printf( Perl_debug_log," - legal\n"));
1971 state = tmp;
1972 break;
07be1b83
YO
1973 }
1974 else {
786e8c11 1975 DEBUG_TRIE_EXECUTE_r(
73104a1b 1976 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 1977 failed = 1;
73104a1b 1978 state = aho->fail[state];
07be1b83 1979 }
07be1b83 1980 }
73104a1b
KW
1981 else {
1982 /* we must be accepting here */
1983 DEBUG_TRIE_EXECUTE_r(
1984 PerlIO_printf( Perl_debug_log," - accepting\n"));
1985 failed = 1;
1986 break;
786e8c11 1987 }
73104a1b
KW
1988 } while(state);
1989 uc += len;
1990 if (failed) {
1991 if (leftmost)
1992 break;
1993 if (!state) state = 1;
07be1b83 1994 }
73104a1b
KW
1995 }
1996 if ( aho->states[ state ].wordnum ) {
1997 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1998 if (!leftmost || lpos < leftmost) {
1999 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2000 leftmost = lpos;
07be1b83
YO
2001 }
2002 }
73104a1b
KW
2003 if (leftmost) {
2004 s = (char*)leftmost;
2005 DEBUG_TRIE_EXECUTE_r({
2006 PerlIO_printf(
2007 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2008 (UV)accepted_word, (IV)(s - real_start)
2009 );
2010 });
2011 if (!reginfo || regtry(reginfo, &s)) {
2012 FREETMPS;
2013 LEAVE;
2014 goto got_it;
2015 }
2016 s = HOPc(s,1);
2017 DEBUG_TRIE_EXECUTE_r({
2018 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2019 });
2020 } else {
2021 DEBUG_TRIE_EXECUTE_r(
2022 PerlIO_printf( Perl_debug_log,"No match.\n"));
2023 break;
2024 }
2025 }
2026 FREETMPS;
2027 LEAVE;
2028 }
2029 break;
2030 default:
2031 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2032 break;
2033 }
2034 return 0;
2035 got_it:
2036 return s;
6eb5f6b9
JH
2037}
2038
fae667d5 2039
6eb5f6b9
JH
2040/*
2041 - regexec_flags - match a regexp against a string
2042 */
2043I32
5aaab254 2044Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
6eb5f6b9 2045 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2046/* stringarg: the point in the string at which to begin matching */
2047/* strend: pointer to null at end of string */
2048/* strbeg: real beginning of string */
2049/* minend: end of match must be >= minend bytes after stringarg. */
2050/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2051 * itself is accessed via the pointers above */
2052/* data: May be used for some additional optimizations.
2053 Currently its only used, with a U32 cast, for transmitting
2054 the ganch offset when doing a /g match. This will change */
2055/* nosave: For optimizations. */
2056
6eb5f6b9 2057{
97aff369 2058 dVAR;
8d919b0a 2059 struct regexp *const prog = ReANY(rx);
5aaab254 2060 char *s;
eb578fdb 2061 regnode *c;
5aaab254 2062 char *startpos = stringarg;
6eb5f6b9
JH
2063 I32 minlen; /* must match at least this many chars */
2064 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2065 I32 end_shift = 0; /* Same for the end. */ /* CC */
2066 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2067 char *scream_olds = NULL;
f2ed9b32 2068 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2069 I32 multiline;
f8fc2ecf 2070 RXi_GET_DECL(prog,progi);
3b0527fe 2071 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2072 regexp_paren_pair *swap = NULL;
a3621e74
YO
2073 GET_RE_DEBUG_FLAGS_DECL;
2074
7918f24d 2075 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2076 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2077
2078 /* Be paranoid... */
2079 if (prog == NULL || startpos == NULL) {
2080 Perl_croak(aTHX_ "NULL regexp parameter");
2081 return 0;
2082 }
2083
bbe252da 2084 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2085 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2086
f2ed9b32 2087 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2088 DEBUG_EXECUTE_r(
f2ed9b32 2089 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2090 "Matching");
2091 );
bac06658 2092
6eb5f6b9 2093 minlen = prog->minlen;
1de06328
YO
2094
2095 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2096 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2097 "String too short [regexec_flags]...\n"));
2098 goto phooey;
1aa99e6b 2099 }
6eb5f6b9 2100
1de06328 2101
6eb5f6b9 2102 /* Check validity of program. */
f8fc2ecf 2103 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2104 Perl_croak(aTHX_ "corrupted regexp program");
2105 }
2106
2107 PL_reg_flags = 0;
ed301438 2108 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2109 PL_reg_maxiter = 0;
2110
984e6dd1 2111 reginfo.is_utf8_pat = cBOOL(RX_UTF8(rx));
39819bd9 2112 reginfo.warned = FALSE;
6eb5f6b9 2113 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2114 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2115 PL_bostr = strbeg;
3b0527fe 2116 reginfo.sv = sv;
6eb5f6b9
JH
2117
2118 /* Mark end of line for $ (and such) */
2119 PL_regeol = strend;
2120
2121 /* see how far we have to get to not match where we matched before */
3b0527fe 2122 reginfo.till = startpos+minend;
6eb5f6b9 2123
6eb5f6b9
JH
2124 /* If there is a "must appear" string, look for it. */
2125 s = startpos;
2126
bbe252da 2127 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2128 MAGIC *mg;
2c296965 2129 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2130 reginfo.ganch = startpos + prog->gofs;
2c296965 2131 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2132 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2133 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2134 && SvMAGIC(sv)
14befaf4
DM
2135 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2136 && mg->mg_len >= 0) {
3b0527fe 2137 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2138 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2139 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2140
bbe252da 2141 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2142 if (s > reginfo.ganch)
6eb5f6b9 2143 goto phooey;
58e23c8d 2144 s = reginfo.ganch - prog->gofs;
2c296965 2145 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2146 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2147 if (s < strbeg)
2148 goto phooey;
6eb5f6b9
JH
2149 }
2150 }
58e23c8d 2151 else if (data) {
70685ca0 2152 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2153 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2154 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2155
2156 } else { /* pos() not defined */
3b0527fe 2157 reginfo.ganch = strbeg;
2c296965
YO
2158 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2159 "GPOS: reginfo.ganch = strbeg\n"));
2160 }
6eb5f6b9 2161 }
288b8c02 2162 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2163 /* We have to be careful. If the previous successful match
2164 was from this regex we don't want a subsequent partially
2165 successful match to clobber the old results.
2166 So when we detect this possibility we add a swap buffer
d8da0584
KW
2167 to the re, and switch the buffer each match. If we fail,
2168 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2169 */
2170 swap = prog->offs;
2171 /* do we need a save destructor here for eval dies? */
2172 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2173 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2174 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2175 PTR2UV(prog),
2176 PTR2UV(swap),
2177 PTR2UV(prog->offs)
2178 ));
c74340f9 2179 }
a0714e2c 2180 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2181 re_scream_pos_data d;
2182
2183 d.scream_olds = &scream_olds;
2184 d.scream_pos = &scream_pos;
288b8c02 2185 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2186 if (!s) {
a3621e74 2187 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2188 goto phooey; /* not present */
3fa9c3d7 2189 }
6eb5f6b9
JH
2190 }
2191
1de06328 2192
6eb5f6b9
JH
2193
2194 /* Simplest case: anchored match need be tried only once. */
2195 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2196 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2197 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2198 goto got_it;
bbe252da
YO
2199 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2200 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2201 {
2202 char *end;
2203
2204 if (minlen)
2205 dontbother = minlen - 1;
1aa99e6b 2206 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2207 /* for multiline we only have to try after newlines */
33b8afdf 2208 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2209 /* because of the goto we can not easily reuse the macros for bifurcating the
2210 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2211 if (utf8_target) {
2212 if (s == startpos)
2213 goto after_try_utf8;
2214 while (1) {
2215 if (regtry(&reginfo, &s)) {
2216 goto got_it;
2217 }
2218 after_try_utf8:
2219 if (s > end) {
2220 goto phooey;
2221 }
2222 if (prog->extflags & RXf_USE_INTUIT) {
2223 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2224 if (!s) {
2225 goto phooey;
2226 }
2227 }
2228 else {
2229 s += UTF8SKIP(s);
2230 }
2231 }
2232 } /* end search for check string in unicode */
2233 else {
2234 if (s == startpos) {
2235 goto after_try_latin;
2236 }
2237 while (1) {
2238 if (regtry(&reginfo, &s)) {
2239 goto got_it;
2240 }
2241 after_try_latin:
2242 if (s > end) {
2243 goto phooey;
2244 }
2245 if (prog->extflags & RXf_USE_INTUIT) {
2246 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2247 if (!s) {
2248 goto phooey;
2249 }
2250 }
2251 else {
2252 s++;
2253 }
2254 }
2255 } /* end search for check string in latin*/
2256 } /* end search for check string */
2257 else { /* search for newline */
2258 if (s > startpos) {
2259 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2260 s--;
92f3d482 2261 }
21eede78
YO
2262 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2263 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2264 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2265 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2266 goto got_it;
2267 }
92f3d482
YO
2268 }
2269 } /* end search for newline */
2270 } /* end anchored/multiline check string search */
6eb5f6b9 2271 goto phooey;
bbe252da 2272 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2273 {
486ec47a 2274 /* the warning about reginfo.ganch being used without initialization
bbe252da 2275 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2276 and we only enter this block when the same bit is set. */
58e23c8d 2277 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2278
2279 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2280 goto got_it;
2281 goto phooey;
2282 }
2283
2284 /* Messy cases: unanchored match. */
bbe252da 2285 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2286 /* we have /x+whatever/ */
984e6dd1 2287 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2288 char ch;
bf93d4cc
GS
2289#ifdef DEBUGGING
2290 int did_match = 0;
2291#endif
f2ed9b32 2292 if (utf8_target) {
7e0d5ad7
KW
2293 if (! prog->anchored_utf8) {
2294 to_utf8_substr(prog);
2295 }
2296 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2297 REXEC_FBC_SCAN(
6eb5f6b9 2298 if (*s == ch) {
a3621e74 2299 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2300 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2301 s += UTF8SKIP(s);
2302 while (s < strend && *s == ch)
2303 s += UTF8SKIP(s);
2304 }
4cadc6a9 2305 );
7e0d5ad7 2306
6eb5f6b9
JH
2307 }
2308 else {
7e0d5ad7
KW
2309 if (! prog->anchored_substr) {
2310 if (! to_byte_substr(prog)) {
6b54ddc5 2311 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2312 }
2313 }
2314 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2315 REXEC_FBC_SCAN(
6eb5f6b9 2316 if (*s == ch) {
a3621e74 2317 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2318 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2319 s++;
2320 while (s < strend && *s == ch)
2321 s++;
2322 }
4cadc6a9 2323 );
6eb5f6b9 2324 }
a3621e74 2325 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2326 PerlIO_printf(Perl_debug_log,
b7953727
JH
2327 "Did not find anchored character...\n")
2328 );
6eb5f6b9 2329 }
a0714e2c
SS
2330 else if (prog->anchored_substr != NULL
2331 || prog->anchored_utf8 != NULL
2332 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2333 && prog->float_max_offset < strend - s)) {
2334 SV *must;
2335 I32 back_max;
2336 I32 back_min;
2337 char *last;
6eb5f6b9 2338 char *last1; /* Last position checked before */
bf93d4cc
GS
2339#ifdef DEBUGGING
2340 int did_match = 0;
2341#endif
33b8afdf 2342 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2343 if (utf8_target) {
2344 if (! prog->anchored_utf8) {
2345 to_utf8_substr(prog);
2346 }
2347 must = prog->anchored_utf8;
2348 }
2349 else {
2350 if (! prog->anchored_substr) {
2351 if (! to_byte_substr(prog)) {
6b54ddc5 2352 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2353 }
2354 }
2355 must = prog->anchored_substr;
2356 }
33b8afdf
JH
2357 back_max = back_min = prog->anchored_offset;
2358 } else {
7e0d5ad7
KW
2359 if (utf8_target) {
2360 if (! prog->float_utf8) {
2361 to_utf8_substr(prog);
2362 }
2363 must = prog->float_utf8;
2364 }
2365 else {
2366 if (! prog->float_substr) {
2367 if (! to_byte_substr(prog)) {
6b54ddc5 2368 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2369 }
2370 }
2371 must = prog->float_substr;
2372 }
33b8afdf
JH
2373 back_max = prog->float_max_offset;
2374 back_min = prog->float_min_offset;
2375 }
1de06328 2376
1de06328
YO
2377 if (back_min<0) {
2378 last = strend;
2379 } else {
2380 last = HOP3c(strend, /* Cannot start after this */
2381 -(I32)(CHR_SVLEN(must)
2382 - (SvTAIL(must) != 0) + back_min), strbeg);
2383 }
6eb5f6b9
JH
2384 if (s > PL_bostr)
2385 last1 = HOPc(s, -1);
2386 else
2387 last1 = s - 1; /* bogus */
2388
a0288114 2389 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2390 check_substr==must. */
2391 scream_pos = -1;
2392 dontbother = end_shift;
2393 strend = HOPc(strend, -dontbother);
2394 while ( (s <= last) &&
c33e64f0 2395 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2396 (unsigned char*)strend, must,
c33e64f0 2397 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2398 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2399 if (HOPc(s, -back_max) > last1) {
2400 last1 = HOPc(s, -back_min);
2401 s = HOPc(s, -back_max);
2402 }
2403 else {
52657f30 2404 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2405
2406 last1 = HOPc(s, -back_min);
52657f30 2407 s = t;
6eb5f6b9 2408 }
f2ed9b32 2409 if (utf8_target) {
6eb5f6b9 2410 while (s <= last1) {
24b23f37 2411 if (regtry(&reginfo, &s))
6eb5f6b9 2412 goto got_it;
7016d6eb
DM
2413 if (s >= last1) {
2414 s++; /* to break out of outer loop */
2415 break;
2416 }
2417 s += UTF8SKIP(s);
6eb5f6b9
JH
2418 }
2419 }
2420 else {
2421 while (s <= last1) {
24b23f37 2422 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2423 goto got_it;
2424 s++;
2425 }
2426 }
2427 }
ab3bbdeb 2428 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2429 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2430 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2431 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2432 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2433 ? "anchored" : "floating"),
ab3bbdeb
YO
2434 quoted, RE_SV_TAIL(must));
2435 });
6eb5f6b9
JH
2436 goto phooey;
2437 }
f8fc2ecf 2438 else if ( (c = progi->regstclass) ) {
f14c76ed 2439 if (minlen) {
f8fc2ecf 2440 const OPCODE op = OP(progi->regstclass);
66e933ab 2441 /* don't bother with what can't match */
786e8c11 2442 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2443 strend = HOPc(strend, -(minlen - 1));
2444 }
a3621e74 2445 DEBUG_EXECUTE_r({
be8e71aa 2446 SV * const prop = sv_newmortal();
32fc9b6a 2447 regprop(prog, prop, c);
0df25f3d 2448 {
f2ed9b32 2449 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2450 s,strend-s,60);
0df25f3d 2451 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2452 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2453 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2454 quoted, (int)(strend - s));
0df25f3d 2455 }
ffc61ed2 2456 });
984e6dd1 2457 if (find_byclass(prog, c, s, strend, &reginfo, reginfo.is_utf8_pat))
6eb5f6b9 2458 goto got_it;
07be1b83 2459 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2460 }
2461 else {
2462 dontbother = 0;
a0714e2c 2463 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2464 /* Trim the end. */
6af40bd7 2465 char *last= NULL;
33b8afdf 2466 SV* float_real;
c33e64f0
FC
2467 STRLEN len;
2468 const char *little;
33b8afdf 2469
7e0d5ad7
KW
2470 if (utf8_target) {
2471 if (! prog->float_utf8) {
2472 to_utf8_substr(prog);
2473 }
2474 float_real = prog->float_utf8;
2475 }
2476 else {
2477 if (! prog->float_substr) {
2478 if (! to_byte_substr(prog)) {
6b54ddc5 2479 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2480 }
2481 }
2482 float_real = prog->float_substr;
2483 }
d6a28714 2484
c33e64f0
FC
2485 little = SvPV_const(float_real, len);
2486 if (SvTAIL(float_real)) {
7f18ad16
KW
2487 /* This means that float_real contains an artificial \n on
2488 * the end due to the presence of something like this:
2489 * /foo$/ where we can match both "foo" and "foo\n" at the
2490 * end of the string. So we have to compare the end of the
2491 * string first against the float_real without the \n and
2492 * then against the full float_real with the string. We
2493 * have to watch out for cases where the string might be
2494 * smaller than the float_real or the float_real without
2495 * the \n. */
1a13b075
YO
2496 char *checkpos= strend - len;
2497 DEBUG_OPTIMISE_r(
2498 PerlIO_printf(Perl_debug_log,
2499 "%sChecking for float_real.%s\n",
2500 PL_colors[4], PL_colors[5]));
2501 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2502 /* can't match, even if we remove the trailing \n
2503 * string is too short to match */
1a13b075
YO
2504 DEBUG_EXECUTE_r(
2505 PerlIO_printf(Perl_debug_log,
2506 "%sString shorter than required trailing substring, cannot match.%s\n",
2507 PL_colors[4], PL_colors[5]));
2508 goto phooey;
2509 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2510 /* can match, the end of the string matches without the
2511 * "\n" */
1a13b075
YO
2512 last = checkpos + 1;
2513 } else if (checkpos < strbeg) {
7f18ad16
KW
2514 /* cant match, string is too short when the "\n" is
2515 * included */
1a13b075
YO
2516 DEBUG_EXECUTE_r(
2517 PerlIO_printf(Perl_debug_log,
2518 "%sString does not contain required trailing substring, cannot match.%s\n",
2519 PL_colors[4], PL_colors[5]));
2520 goto phooey;
2521 } else if (!multiline) {
7f18ad16
KW
2522 /* non multiline match, so compare with the "\n" at the
2523 * end of the string */
1a13b075
YO
2524 if (memEQ(checkpos, little, len)) {
2525 last= checkpos;
2526 } else {
2527 DEBUG_EXECUTE_r(
2528 PerlIO_printf(Perl_debug_log,
2529 "%sString does not contain required trailing substring, cannot match.%s\n",
2530 PL_colors[4], PL_colors[5]));
2531 goto phooey;
2532 }
2533 } else {
7f18ad16
KW
2534 /* multiline match, so we have to search for a place
2535 * where the full string is located */
d6a28714 2536 goto find_last;
1a13b075 2537 }
c33e64f0 2538 } else {
d6a28714 2539 find_last:
9041c2e3 2540 if (len)
d6a28714 2541 last = rninstr(s, strend, little, little + len);
b8c5462f 2542 else
a0288114 2543 last = strend; /* matching "$" */
b8c5462f 2544 }
6af40bd7 2545 if (!last) {
7f18ad16
KW
2546 /* at one point this block contained a comment which was
2547 * probably incorrect, which said that this was a "should not
2548 * happen" case. Even if it was true when it was written I am
2549 * pretty sure it is not anymore, so I have removed the comment
2550 * and replaced it with this one. Yves */
6bda09f9
YO
2551 DEBUG_EXECUTE_r(
2552 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2553 "String does not contain required substring, cannot match.\n"
2554 ));
2555 goto phooey;
bf93d4cc 2556 }
d6a28714
JH
2557 dontbother = strend - last + prog->float_min_offset;
2558 }
2559 if (minlen && (dontbother < minlen))
2560 dontbother = minlen - 1;
2561 strend -= dontbother; /* this one's always in bytes! */
2562 /* We don't know much -- general case. */
f2ed9b32 2563 if (utf8_target) {
d6a28714 2564 for (;;) {
24b23f37 2565 if (regtry(&reginfo, &s))
d6a28714
JH
2566 goto got_it;
2567 if (s >= strend)
2568 break;
b8c5462f 2569 s += UTF8SKIP(s);
d6a28714
JH
2570 };
2571 }
2572 else {
2573 do {
24b23f37 2574 if (regtry(&reginfo, &s))
d6a28714
JH
2575 goto got_it;
2576 } while (s++ < strend);
2577 }
2578 }
2579
2580 /* Failure. */
2581 goto phooey;
2582
2583got_it:
495f47a5
DM
2584 DEBUG_BUFFERS_r(
2585 if (swap)
2586 PerlIO_printf(Perl_debug_log,
2587 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2588 PTR2UV(prog),
2589 PTR2UV(swap)
2590 );
2591 );
e9105d30 2592 Safefree(swap);
288b8c02 2593 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
d6a28714 2594
ed301438 2595 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2596 restore_pos(aTHX_ prog);
5daac39c
NC
2597 if (RXp_PAREN_NAMES(prog))
2598 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2599
2600 /* make sure $`, $&, $', and $digit will work later */
2601 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2602 if (flags & REXEC_COPY_STR) {
db2c6cb3
FC
2603#ifdef PERL_ANY_COW
2604 if (SvCANCOW(sv)) {
ed252734
NC
2605 if (DEBUG_C_TEST) {
2606 PerlIO_printf(Perl_debug_log,
2607 "Copy on write: regexp capture, type %d\n",
2608 (int) SvTYPE(sv));
2609 }
77f8f7c1 2610 RX_MATCH_COPY_FREE(rx);
ed252734 2611 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2612 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2613 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2614 prog->sublen = PL_regeol - strbeg;
2615 prog->suboffset = 0;
2616 prog->subcoffset = 0;
ed252734
NC
2617 } else
2618#endif
2619 {
6502e081
DM
2620 I32 min = 0;
2621 I32 max = PL_regeol - strbeg;
2622 I32 sublen;
2623
2624 if ( (flags & REXEC_COPY_SKIP_POST)
2625 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2626 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2627 ) { /* don't copy $' part of string */
3de645a8 2628 U32 n = 0;
6502e081
DM
2629 max = -1;
2630 /* calculate the right-most part of the string covered
2631 * by a capture. Due to look-ahead, this may be to
2632 * the right of $&, so we have to scan all captures */
2633 while (n <= prog->lastparen) {
2634 if (prog->offs[n].end > max)
2635 max = prog->offs[n].end;
2636 n++;
2637 }
2638 if (max == -1)
2639 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2640 ? prog->offs[0].start
2641 : 0;
2642 assert(max >= 0 && max <= PL_regeol - strbeg);
2643 }
2644
2645 if ( (flags & REXEC_COPY_SKIP_PRE)
2646 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2647 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2648 ) { /* don't copy $` part of string */
3de645a8 2649 U32 n = 0;
6502e081
DM
2650 min = max;
2651 /* calculate the left-most part of the string covered
2652 * by a capture. Due to look-behind, this may be to
2653 * the left of $&, so we have to scan all captures */
2654 while (min && n <= prog->lastparen) {
2655 if ( prog->offs[n].start != -1
2656 && prog->offs[n].start < min)
2657 {
2658 min = prog->offs[n].start;
2659 }
2660 n++;
2661 }
2662 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2663 && min > prog->offs[0].end
2664 )
2665 min = prog->offs[0].end;
2666
2667 }
2668
2669 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2670 sublen = max - min;
2671
2672 if (RX_MATCH_COPIED(rx)) {
2673 if (sublen > prog->sublen)
2674 prog->subbeg =
2675 (char*)saferealloc(prog->subbeg, sublen+1);
2676 }
2677 else
2678 prog->subbeg = (char*)safemalloc(sublen+1);
2679 Copy(strbeg + min, prog->subbeg, sublen, char);
2680 prog->subbeg[sublen] = '\0';
2681 prog->suboffset = min;
2682 prog->sublen = sublen;
77f8f7c1 2683 RX_MATCH_COPIED_on(rx);
6502e081 2684 }
6502e081
DM
2685 prog->subcoffset = prog->suboffset;
2686 if (prog->suboffset && utf8_target) {
2687 /* Convert byte offset to chars.
2688 * XXX ideally should only compute this if @-/@+
2689 * has been seen, a la PL_sawampersand ??? */
2690
2691 /* If there's a direct correspondence between the
2692 * string which we're matching and the original SV,
2693 * then we can use the utf8 len cache associated with
2694 * the SV. In particular, it means that under //g,
2695 * sv_pos_b2u() will use the previously cached
2696 * position to speed up working out the new length of
2697 * subcoffset, rather than counting from the start of
2698 * the string each time. This stops
2699 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2700 * from going quadratic */
2701 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2702 sv_pos_b2u(sv, &(prog->subcoffset));
2703 else
2704 prog->subcoffset = utf8_length((U8*)strbeg,
2705 (U8*)(strbeg+prog->suboffset));
2706 }
d6a28714
JH
2707 }
2708 else {
6502e081 2709 RX_MATCH_COPY_FREE(rx);
d6a28714 2710 prog->subbeg = strbeg;
6502e081
DM
2711 prog->suboffset = 0;
2712 prog->subcoffset = 0;
d6a28714
JH
2713 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2714 }
2715 }
9041c2e3 2716
d6a28714
JH
2717 return 1;
2718
2719phooey:
a3621e74 2720 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2721 PL_colors[4], PL_colors[5]));
ed301438 2722 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2723 restore_pos(aTHX_ prog);
e9105d30 2724 if (swap) {
c74340f9 2725 /* we failed :-( roll it back */
495f47a5
DM
2726 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2727 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2728 PTR2UV(prog),
2729 PTR2UV(prog->offs),
2730 PTR2UV(swap)
2731 ));
e9105d30
GG
2732 Safefree(prog->offs);
2733 prog->offs = swap;
2734 }
d6a28714
JH
2735 return 0;
2736}
2737
6bda09f9 2738
ec43f78b
DM
2739/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2740 * Do inc before dec, in case old and new rex are the same */
2741#define SET_reg_curpm(Re2) \
2742 if (PL_reg_state.re_state_eval_setup_done) { \
2743 (void)ReREFCNT_inc(Re2); \
2744 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2745 PM_SETRE((PL_reg_curpm), (Re2)); \
2746 }
2747
2748
d6a28714
JH
2749/*
2750 - regtry - try match at specific point
2751 */
2752STATIC I32 /* 0 failure, 1 success */
f73aaa43 2753S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2754{
97aff369 2755 dVAR;
d6a28714 2756 CHECKPOINT lastcp;
288b8c02 2757 REGEXP *const rx = reginfo->prog;
8d919b0a 2758 regexp *const prog = ReANY(rx);
f73aaa43 2759 I32 result;
f8fc2ecf 2760 RXi_GET_DECL(prog,progi);
a3621e74 2761 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2762
2763 PERL_ARGS_ASSERT_REGTRY;
2764
24b23f37 2765 reginfo->cutpoint=NULL;
d6a28714 2766
ed301438
DM
2767 if ((prog->extflags & RXf_EVAL_SEEN)
2768 && !PL_reg_state.re_state_eval_setup_done)
2769 {
d6a28714
JH
2770 MAGIC *mg;
2771
ed301438 2772 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2773 if (reginfo->sv) {
d6a28714 2774 /* Make $_ available to executed code. */
3b0527fe 2775 if (reginfo->sv != DEFSV) {
59f00321 2776 SAVE_DEFSV;
414bf5ae 2777 DEFSV_set(reginfo->sv);
b8c5462f 2778 }
d6a28714 2779
3b0527fe
DM
2780 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2781 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2782 /* prepare for quick setting of pos */
d300d9fa 2783#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2784 if (SvIsCOW(reginfo->sv))
2785 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2786#endif
3dab1dad 2787 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2788 &PL_vtbl_mglob, NULL, 0);
d6a28714 2789 mg->mg_len = -1;
b8c5462f 2790 }
d6a28714
JH
2791 PL_reg_magic = mg;
2792 PL_reg_oldpos = mg->mg_len;
4f639d21 2793 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2794 }
09687e5a 2795 if (!PL_reg_curpm) {
a02a5408 2796 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2797#ifdef USE_ITHREADS
2798 {
14a49a24 2799 SV* const repointer = &PL_sv_undef;
92313705
NC
2800 /* this regexp is also owned by the new PL_reg_curpm, which
2801 will try to free it. */
d2ece331 2802 av_push(PL_regex_padav, repointer);
09687e5a
AB
2803 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2804 PL_regex_pad = AvARRAY(PL_regex_padav);
2805 }
2806#endif
2807 }
ec43f78b 2808 SET_reg_curpm(rx);
d6a28714
JH
2809 PL_reg_oldcurpm = PL_curpm;
2810 PL_curpm = PL_reg_curpm;
07bc277f 2811 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2812 /* Here is a serious problem: we cannot rewrite subbeg,
2813 since it may be needed if this match fails. Thus
2814 $` inside (?{}) could fail... */
2815 PL_reg_oldsaved = prog->subbeg;
2816 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2817 PL_reg_oldsavedoffset = prog->suboffset;
2818 PL_reg_oldsavedcoffset = prog->suboffset;
db2c6cb3 2819#ifdef PERL_ANY_COW
ed252734
NC
2820 PL_nrs = prog->saved_copy;
2821#endif
07bc277f 2822 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2823 }
2824 else
bd61b366 2825 PL_reg_oldsaved = NULL;
d6a28714 2826 prog->subbeg = PL_bostr;
6502e081
DM
2827 prog->suboffset = 0;
2828 prog->subcoffset = 0;
d6a28714
JH
2829 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2830 }
97ca13b7 2831#ifdef DEBUGGING
f73aaa43 2832 PL_reg_starttry = *startposp;
97ca13b7 2833#endif
f73aaa43 2834 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2835 prog->lastparen = 0;
03994de8 2836 prog->lastcloseparen = 0;
d6a28714
JH
2837
2838 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2839 to do this again and again, prog->lastparen should take care of
3dd2943c 2840 this! --ilya*/
dafc8851
JH
2841
2842 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2843 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2844 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2845 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2846 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2847 * Meanwhile, this code *is* needed for the
daf18116
JH
2848 * above-mentioned test suite tests to succeed. The common theme
2849 * on those tests seems to be returning null fields from matches.
225593e1 2850 * --jhi updated by dapm */
dafc8851 2851#if 1
d6a28714 2852 if (prog->nparens) {
b93070ed 2853 regexp_paren_pair *pp = prog->offs;
eb578fdb 2854 I32 i;
b93070ed 2855 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2856 ++pp;
2857 pp->start = -1;
2858 pp->end = -1;
d6a28714
JH
2859 }
2860 }
dafc8851 2861#endif
02db2b7b 2862 REGCP_SET(lastcp);
f73aaa43
DM
2863 result = regmatch(reginfo, *startposp, progi->program + 1);
2864 if (result != -1) {
2865 prog->offs[0].end = result;
d6a28714
JH
2866 return 1;
2867 }
24b23f37 2868 if (reginfo->cutpoint)
f73aaa43 2869 *startposp= reginfo->cutpoint;
02db2b7b 2870 REGCP_UNWIND(lastcp);
d6a28714
JH
2871 return 0;
2872}
2873
02db2b7b 2874
8ba1375e
MJD
2875#define sayYES goto yes
2876#define sayNO goto no
262b90c4 2877#define sayNO_SILENT goto no_silent
8ba1375e 2878
f9f4320a
YO
2879/* we dont use STMT_START/END here because it leads to
2880 "unreachable code" warnings, which are bogus, but distracting. */
2881#define CACHEsayNO \
c476f425
DM
2882 if (ST.cache_mask) \
2883 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2884 sayNO
3298f257 2885
a3621e74 2886/* this is used to determine how far from the left messages like
265c4333
YO
2887 'failed...' are printed. It should be set such that messages
2888 are inline with the regop output that created them.
a3621e74 2889*/
265c4333 2890#define REPORT_CODE_OFF 32
a3621e74
YO
2891
2892
40a82448
DM
2893#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2894#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2895#define CHRTEST_NOT_A_CP_1 -999
2896#define CHRTEST_NOT_A_CP_2 -998
9e137952 2897
86545054
DM
2898#define SLAB_FIRST(s) (&(s)->states[0])
2899#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2900
5d9a96ca
DM
2901/* grab a new slab and return the first slot in it */
2902
2903STATIC regmatch_state *
2904S_push_slab(pTHX)
2905{
a35a87e7 2906#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2907 dMY_CXT;
2908#endif
5d9a96ca
DM
2909 regmatch_slab *s = PL_regmatch_slab->next;
2910 if (!s) {
2911 Newx(s, 1, regmatch_slab);
2912 s->prev = PL_regmatch_slab;
2913 s->next = NULL;
2914 PL_regmatch_slab->next = s;
2915 }
2916 PL_regmatch_slab = s;
86545054 2917 return SLAB_FIRST(s);
5d9a96ca 2918}
5b47454d 2919
95b24440 2920
40a82448
DM
2921/* push a new state then goto it */
2922
4d5016e5
DM
2923#define PUSH_STATE_GOTO(state, node, input) \
2924 pushinput = input; \
40a82448
DM
2925 scan = node; \
2926 st->resume_state = state; \
2927 goto push_state;
2928
2929/* push a new state with success backtracking, then goto it */
2930
4d5016e5
DM
2931#define PUSH_YES_STATE_GOTO(state, node, input) \
2932 pushinput = input; \
40a82448
DM
2933 scan = node; \
2934 st->resume_state = state; \
2935 goto push_yes_state;
2936
aa283a38 2937
aa283a38 2938
4d5016e5 2939
d6a28714 2940/*
95b24440 2941
bf1f174e
DM
2942regmatch() - main matching routine
2943
2944This is basically one big switch statement in a loop. We execute an op,
2945set 'next' to point the next op, and continue. If we come to a point which
2946we may need to backtrack to on failure such as (A|B|C), we push a
2947backtrack state onto the backtrack stack. On failure, we pop the top
2948state, and re-enter the loop at the state indicated. If there are no more
2949states to pop, we return failure.
2950
2951Sometimes we also need to backtrack on success; for example /A+/, where
2952after successfully matching one A, we need to go back and try to
2953match another one; similarly for lookahead assertions: if the assertion
2954completes successfully, we backtrack to the state just before the assertion
2955and then carry on. In these cases, the pushed state is marked as
2956'backtrack on success too'. This marking is in fact done by a chain of
2957pointers, each pointing to the previous 'yes' state. On success, we pop to
2958the nearest yes state, discarding any intermediate failure-only states.
2959Sometimes a yes state is pushed just to force some cleanup code to be
2960called at the end of a successful match or submatch; e.g. (??{$re}) uses
2961it to free the inner regex.
2962
2963Note that failure backtracking rewinds the cursor position, while
2964success backtracking leaves it alone.
2965
2966A pattern is complete when the END op is executed, while a subpattern
2967such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2968ops trigger the "pop to last yes state if any, otherwise return true"
2969behaviour.
2970
2971A common convention in this function is to use A and B to refer to the two
2972subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2973the subpattern to be matched possibly multiple times, while B is the entire
2974rest of the pattern. Variable and state names reflect this convention.
2975
2976The states in the main switch are the union of ops and failure/success of
2977substates associated with with that op. For example, IFMATCH is the op
2978that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2979'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2980successfully matched A and IFMATCH_A_fail is a state saying that we have
2981just failed to match A. Resume states always come in pairs. The backtrack
2982state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2983at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2984on success or failure.
2985
2986The struct that holds a backtracking state is actually a big union, with
2987one variant for each major type of op. The variable st points to the
2988top-most backtrack struct. To make the code clearer, within each
2989block of code we #define ST to alias the relevant union.
2990
2991Here's a concrete example of a (vastly oversimplified) IFMATCH
2992implementation:
2993
2994 switch (state) {
2995 ....
2996
2997#define ST st->u.ifmatch
2998
2999 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3000 ST.foo = ...; // some state we wish to save
95b24440 3001 ...
bf1f174e
DM
3002 // push a yes backtrack state with a resume value of
3003 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3004 // first node of A:
4d5016e5 3005 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3006 // NOTREACHED
3007
3008 case IFMATCH_A: // we have successfully executed A; now continue with B
3009 next = B;
3010 bar = ST.foo; // do something with the preserved value
3011 break;
3012
3013 case IFMATCH_A_fail: // A failed, so the assertion failed
3014 ...; // do some housekeeping, then ...
3015 sayNO; // propagate the failure
3016
3017#undef ST
95b24440 3018
bf1f174e
DM
3019 ...
3020 }
95b24440 3021
bf1f174e
DM
3022For any old-timers reading this who are familiar with the old recursive
3023approach, the code above is equivalent to:
95b24440 3024
bf1f174e
DM
3025 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3026 {
3027 int foo = ...
95b24440 3028 ...
bf1f174e
DM
3029 if (regmatch(A)) {
3030 next = B;
3031 bar = foo;
3032 break;
95b24440 3033 }
bf1f174e
DM
3034 ...; // do some housekeeping, then ...
3035 sayNO; // propagate the failure
95b24440 3036 }
bf1f174e
DM
3037
3038The topmost backtrack state, pointed to by st, is usually free. If you
3039want to claim it, populate any ST.foo fields in it with values you wish to
3040save, then do one of
3041
4d5016e5
DM
3042 PUSH_STATE_GOTO(resume_state, node, newinput);
3043 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3044
3045which sets that backtrack state's resume value to 'resume_state', pushes a
3046new free entry to the top of the backtrack stack, then goes to 'node'.
3047On backtracking, the free slot is popped, and the saved state becomes the
3048new free state. An ST.foo field in this new top state can be temporarily
3049accessed to retrieve values, but once the main loop is re-entered, it
3050becomes available for reuse.
3051
3052Note that the depth of the backtrack stack constantly increases during the
3053left-to-right execution of the pattern, rather than going up and down with
3054the pattern nesting. For example the stack is at its maximum at Z at the
3055end of the pattern, rather than at X in the following:
3056
3057 /(((X)+)+)+....(Y)+....Z/
3058
3059The only exceptions to this are lookahead/behind assertions and the cut,
3060(?>A), which pop all the backtrack states associated with A before
3061continuing.
3062
486ec47a 3063Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3064PL_regmatch_state and st always point to the currently active state,
3065and PL_regmatch_slab points to the slab currently containing
3066PL_regmatch_state. The first time regmatch() is called, the first slab is
3067allocated, and is never freed until interpreter destruction. When the slab
3068is full, a new one is allocated and chained to the end. At exit from
3069regmatch(), slabs allocated since entry are freed.
3070
3071*/
95b24440 3072
40a82448 3073
5bc10b2c 3074#define DEBUG_STATE_pp(pp) \
265c4333 3075 DEBUG_STATE_r({ \
f2ed9b32 3076 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3077 PerlIO_printf(Perl_debug_log, \
5d458dd8 3078 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3079 depth*2, "", \
13d6edb4 3080 PL_reg_name[st->resume_state], \
5d458dd8
YO
3081 ((st==yes_state||st==mark_state) ? "[" : ""), \
3082 ((st==yes_state) ? "Y" : ""), \
3083 ((st==mark_state) ? "M" : ""), \
3084 ((st==yes_state||st==mark_state) ? "]" : "") \
3085 ); \
265c4333 3086 });
5bc10b2c 3087
40a82448 3088
3dab1dad 3089#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3090
3df15adc 3091#ifdef DEBUGGING
5bc10b2c 3092
ab3bbdeb 3093STATIC void
f2ed9b32 3094S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3095 const char *start, const char *end, const char *blurb)
3096{
efd26800 3097 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3098
3099 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3100
ab3bbdeb
YO
3101 if (!PL_colorset)
3102 reginitcolors();
3103 {
3104 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3105 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3106
f2ed9b32 3107 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3108 start, end - start, 60);
3109
3110 PerlIO_printf(Perl_debug_log,
3111 "%s%s REx%s %s against %s\n",
3112 PL_colors[4], blurb, PL_colors[5], s0, s1);
3113
f2ed9b32 3114 if (utf8_target||utf8_pat)
1de06328
YO
3115 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3116 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3117 utf8_pat && utf8_target ? " and " : "",
3118 utf8_target ? "string" : ""
ab3bbdeb
YO
3119 );
3120 }
3121}
3df15adc
YO
3122
3123STATIC void
786e8c11
YO
3124S_dump_exec_pos(pTHX_ const char *locinput,
3125 const regnode *scan,
3126 const char *loc_regeol,
3127 const char *loc_bostr,
3128 const char *loc_reg_starttry,
f2ed9b32 3129 const bool utf8_target)
07be1b83 3130{
786e8c11 3131 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3132 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3133 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3134 /* The part of the string before starttry has one color
3135 (pref0_len chars), between starttry and current
3136 position another one (pref_len - pref0_len chars),
3137 after the current position the third one.
3138 We assume that pref0_len <= pref_len, otherwise we
3139 decrease pref0_len. */
786e8c11
YO
3140 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3141 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3142 int pref0_len;
3143
7918f24d
NC
3144 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3145
f2ed9b32 3146 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3147 pref_len++;
786e8c11
YO
3148 pref0_len = pref_len - (locinput - loc_reg_starttry);
3149 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3150 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3151 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3152 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3153 l--;
3154 if (pref0_len < 0)
3155 pref0_len = 0;
3156 if (pref0_len > pref_len)
3157 pref0_len = pref_len;
3158 {
f2ed9b32 3159 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3160
ab3bbdeb 3161 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3162 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3163
ab3bbdeb 3164 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3165 (locinput - pref_len + pref0_len),
1de06328 3166 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3167
ab3bbdeb 3168 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3169 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3170
1de06328 3171 const STRLEN tlen=len0+len1+len2;
3df15adc 3172 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3173 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3174 (IV)(locinput - loc_bostr),
07be1b83 3175 len0, s0,
07be1b83 3176 len1, s1,
07be1b83 3177 (docolor ? "" : "> <"),
07be1b83 3178 len2, s2,
f9f4320a 3179 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3180 "");
3181 }
3182}
3df15adc 3183
07be1b83
YO
3184#endif
3185
0a4db386
YO
3186/* reg_check_named_buff_matched()
3187 * Checks to see if a named buffer has matched. The data array of
3188 * buffer numbers corresponding to the buffer is expected to reside
3189 * in the regexp->data->data array in the slot stored in the ARG() of
3190 * node involved. Note that this routine doesn't actually care about the
3191 * name, that information is not preserved from compilation to execution.
3192 * Returns the index of the leftmost defined buffer with the given name
3193 * or 0 if non of the buffers matched.
3194 */
3195STATIC I32
7918f24d
NC
3196S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3197{
0a4db386 3198 I32 n;
f8fc2ecf 3199 RXi_GET_DECL(rex,rexi);
ad64d0ec 3200 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3201 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3202
3203 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3204
0a4db386 3205 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3206 if ((I32)rex->lastparen >= nums[n] &&
3207 rex->offs[nums[n]].end != -1)
0a4db386
YO
3208 {
3209 return nums[n];
3210 }
3211 }
3212 return 0;
3213}
3214
2f554ef7
DM
3215
3216/* free all slabs above current one - called during LEAVE_SCOPE */
3217
3218STATIC void
3219S_clear_backtrack_stack(pTHX_ void *p)
3220{
3221 regmatch_slab *s = PL_regmatch_slab->next;
3222 PERL_UNUSED_ARG(p);
3223
3224 if (!s)
3225 return;
3226 PL_regmatch_slab->next = NULL;
3227 while (s) {
3228 regmatch_slab * const osl = s;
3229 s = s->next;
3230 Safefree(osl);
3231 }
3232}
c74f6de9 3233static bool
984e6dd1
DM
3234S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3235 U8* c1_utf8, int *c2p, U8* c2_utf8, bool is_utf8_pat)
c74f6de9 3236{
79a2a0e8
KW
3237 /* This function determines if there are one or two characters that match
3238 * the first character of the passed-in EXACTish node <text_node>, and if
3239 * so, returns them in the passed-in pointers.
c74f6de9 3240 *
79a2a0e8
KW
3241 * If it determines that no possible character in the target string can
3242 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3243 * the first character in <text_node> requires UTF-8 to represent, and the
3244 * target string isn't in UTF-8.)
c74f6de9 3245 *
79a2a0e8
KW
3246 * If there are more than two characters that could match the beginning of
3247 * <text_node>, or if more context is required to determine a match or not,
3248 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3249 *
3250 * The motiviation behind this function is to allow the caller to set up
3251 * tight loops for matching. If <text_node> is of type EXACT, there is
3252 * only one possible character that can match its first character, and so
3253 * the situation is quite simple. But things get much more complicated if
3254 * folding is involved. It may be that the first character of an EXACTFish
3255 * node doesn't participate in any possible fold, e.g., punctuation, so it
3256 * can be matched only by itself. The vast majority of characters that are
3257 * in folds match just two things, their lower and upper-case equivalents.
3258 * But not all are like that; some have multiple possible matches, or match
3259 * sequences of more than one character. This function sorts all that out.
3260 *
3261 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3262 * loop of trying to match A*, we know we can't exit where the thing
3263 * following it isn't a B. And something can't be a B unless it is the
3264 * beginning of B. By putting a quick test for that beginning in a tight
3265 * loop, we can rule out things that can't possibly be B without having to
3266 * break out of the loop, thus avoiding work. Similarly, if A is a single
3267 * character, we can make a tight loop matching A*, using the outputs of
3268 * this function.
3269 *
3270 * If the target string to match isn't in UTF-8, and there aren't
3271 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3272 * the one or two possible octets (which are characters in this situation)
3273 * that can match. In all cases, if there is only one character that can
3274 * match, *<c1p> and *<c2p> will be identical.
3275 *
3276 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3277 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3278 * can match the beginning of <text_node>. They should be declared with at
3279 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3280 * undefined what these contain.) If one or both of the buffers are
3281 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3282 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3283 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3284 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3285 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3286
3287 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8 3288
ddb0d839
KW
3289 UV c1 = CHRTEST_NOT_A_CP_1;
3290 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8
KW
3291 bool use_chrtest_void = FALSE;
3292
3293 /* Used when we have both utf8 input and utf8 output, to avoid converting
3294 * to/from code points */
3295 bool utf8_has_been_setup = FALSE;
3296
c74f6de9
KW
3297 dVAR;
3298
b4291290 3299 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3300
79a2a0e8
KW
3301 if (OP(text_node) == EXACT) {
3302
3303 /* In an exact node, only one thing can be matched, that first
3304 * character. If both the pat and the target are UTF-8, we can just
3305 * copy the input to the output, avoiding finding the code point of
3306 * that character */
984e6dd1 3307 if (!is_utf8_pat) {
79a2a0e8
KW
3308 c2 = c1 = *pat;
3309 }
3310 else if (utf8_target) {
3311 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3312 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3313 utf8_has_been_setup = TRUE;
3314 }
3315 else {
3316 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3317 }
79a2a0e8
KW
3318 }
3319 else /* an EXACTFish node */
984e6dd1 3320 if ((is_utf8_pat
79a2a0e8
KW
3321 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3322 pat + STR_LEN(text_node)))
984e6dd1 3323 || (!is_utf8_pat
79a2a0e8
KW
3324 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3325 pat + STR_LEN(text_node))))
3326 {
3327 /* Multi-character folds require more context to sort out. Also
3328 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3329 * handled outside this routine */
3330 use_chrtest_void = TRUE;
3331 }
3332 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
984e6dd1 3333 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
79a2a0e8
KW
3334 if (c1 > 256) {
3335 /* Load the folds hash, if not already done */
3336 SV** listp;
3337 if (! PL_utf8_foldclosures) {
3338 if (! PL_utf8_tofold) {
3339 U8 dummy[UTF8_MAXBYTES+1];
3340
3341 /* Force loading this by folding an above-Latin1 char */
3342 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3343 assert(PL_utf8_tofold); /* Verify that worked */
3344 }
3345 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3346 }
3347
3348 /* The fold closures data structure is a hash with the keys being
3349 * the UTF-8 of every character that is folded to, like 'k', and
3350 * the values each an array of all code points that fold to its
3351 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3352 * not included */
3353 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3354 (char *) pat,
3355 UTF8SKIP(pat),
3356 FALSE))))
3357 {
3358 /* Not found in the hash, therefore there are no folds
3359 * containing it, so there is only a single character that
3360 * could match */
3361 c2 = c1;
3362 }
3363 else { /* Does participate in folds */
3364 AV* list = (AV*) *listp;
3365 if (av_len(list) != 1) {
3366
3367 /* If there aren't exactly two folds to this, it is outside
3368 * the scope of this function */
3369 use_chrtest_void = TRUE;
3370 }
3371 else { /* There are two. Get them */
3372 SV** c_p = av_fetch(list, 0, FALSE);
3373 if (c_p == NULL) {
3374 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3375 }
3376 c1 = SvUV(*c_p);
3377
3378 c_p = av_fetch(list, 1, FALSE);
3379 if (c_p == NULL) {
3380 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3381 }
3382 c2 = SvUV(*c_p);
3383
3384 /* Folds that cross the 255/256 boundary are forbidden if
3385 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3386 * pattern character is above 256, and its only other match
3387 * is below 256, the only legal match will be to itself.
3388 * We have thrown away the original, so have to compute
3389 * which is the one above 255 */
3390 if ((c1 < 256) != (c2 < 256)) {
3391 if (OP(text_node) == EXACTFL
3392 || (OP(text_node) == EXACTFA
3393 && (isASCII(c1) || isASCII(c2))))
3394 {
3395 if (c1 < 256) {
3396 c1 = c2;
3397 }
3398 else {
3399 c2 = c1;
3400 }
3401 }
3402 }
3403 }
3404 }
3405 }
3406 else /* Here, c1 is < 255 */
3407 if (utf8_target
3408 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3409 && OP(text_node) != EXACTFL
3410 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3411 {
3412 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3413 * folds to this character in the pattern. All such cases except
3414 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3415 * involved in their folds, so are outside the scope of this
3416 * function */
3417 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3418 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3419 }
3420 else {
3421 use_chrtest_void = TRUE;
3422 }
c74f6de9
KW
3423 }
3424 else { /* Here nothing above Latin1 can fold to the pattern character */
3425 switch (OP(text_node)) {
3426
3427 case EXACTFL: /* /l rules */
79a2a0e8 3428 c2 = PL_fold_locale[c1];
c74f6de9
KW
3429 break;
3430
3431 case EXACTF:
3432 if (! utf8_target) { /* /d rules */
79a2a0e8 3433 c2 = PL_fold[c1];
c74f6de9
KW
3434 break;
3435 }
3436 /* FALLTHROUGH */
3437 /* /u rules for all these. This happens to work for
79a2a0e8 3438 * EXACTFA as nothing in Latin1 folds to ASCII */
c74f6de9
KW
3439 case EXACTFA:
3440 case EXACTFU_TRICKYFOLD:
79a2a0e8 3441 case EXACTFU_SS:
c74f6de9 3442 case EXACTFU:
79a2a0e8 3443 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3444 break;
3445
878531d3
KW
3446 default:
3447 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3448 assert(0); /* NOTREACHED */
c74f6de9
KW
3449 }
3450 }
3451 }
79a2a0e8
KW
3452
3453 /* Here have figured things out. Set up the returns */
3454 if (use_chrtest_void) {
3455 *c2p = *c1p = CHRTEST_VOID;
3456 }
3457 else if (utf8_target) {
3458 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3459 uvchr_to_utf8(c1_utf8, c1);
3460 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3461 }
c74f6de9 3462
79a2a0e8
KW
3463 /* Invariants are stored in both the utf8 and byte outputs; Use
3464 * negative numbers otherwise for the byte ones. Make sure that the
3465 * byte ones are the same iff the utf8 ones are the same */
3466 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3467 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3468 ? *c2_utf8
3469 : (c1 == c2)
3470 ? CHRTEST_NOT_A_CP_1
3471 : CHRTEST_NOT_A_CP_2;
3472 }
3473 else if (c1 > 255) {
3474 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3475 can represent */
3476 return FALSE;
3477 }
c74f6de9 3478
79a2a0e8
KW
3479 *c1p = *c2p = c2; /* c2 is the only representable value */
3480 }
3481 else { /* c1 is representable; see about c2 */
3482 *c1p = c1;
3483 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3484 }
2f554ef7 3485
c74f6de9
KW
3486 return TRUE;
3487}
2f554ef7 3488
f73aaa43
DM
3489/* returns -1 on failure, $+[0] on success */
3490STATIC I32
3491S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3492{
a35a87e7 3493#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3494 dMY_CXT;
3495#endif
27da23d5 3496 dVAR;
eb578fdb 3497 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3498 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3499 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3500 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3501 RXi_GET_DECL(rex,rexi);
2f554ef7 3502 I32 oldsave;
5d9a96ca 3503 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3504 regmatch_state *st;
5d9a96ca 3505 /* cache heavy used fields of st in registers */
eb578fdb
KW
3506 regnode *scan;
3507 regnode *next;
3508 U32 n = 0; /* general value; init to avoid compiler warning */
3509 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3510 char *locinput = startpos;
4d5016e5 3511 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3512 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3513
b69b0499 3514 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3515 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3516 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3517 const U32 max_nochange_depth =
3518 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3519 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3520 regmatch_state *yes_state = NULL; /* state to pop to on success of
3521 subpattern */
e2e6a0f1
YO
3522 /* mark_state piggy backs on the yes_state logic so that when we unwind
3523 the stack on success we can update the mark_state as we go */
3524 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3525 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3526 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3527 U32 state_num;
5d458dd8
YO
3528 bool no_final = 0; /* prevent failure from backtracking? */
3529 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3530 char *startpoint = locinput;
5d458dd8
YO
3531 SV *popmark = NULL; /* are we looking for a mark? */
3532 SV *sv_commit = NULL; /* last mark name seen in failure */
3533 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3534 during a successful match */
5d458dd8
YO
3535 U32 lastopen = 0; /* last open we saw */
3536 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3537 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3538 /* these three flags are set by various ops to signal information to
3539 * the very next op. They have a useful lifetime of exactly one loop
3540 * iteration, and are not preserved or restored by state pushes/pops
3541 */
3542 bool sw = 0; /* the condition value in (?(cond)a|b) */
3543 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3544 int logical = 0; /* the following EVAL is:
3545 0: (?{...})
3546 1: (?(?{...})X|Y)
3547 2: (??{...})
3548 or the following IFMATCH/UNLESSM is:
3549 false: plain (?=foo)
3550 true: used as a condition: (?(?=foo))
3551 */
81ed78b2
DM
3552 PAD* last_pad = NULL;
3553 dMULTICALL;
3554 I32 gimme = G_SCALAR;
3555 CV *caller_cv = NULL; /* who called us */
3556 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 3557 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
92da3157 3558 U32 maxopenparen = 0; /* max '(' index seen so far */
3018b823
KW
3559 int to_complement; /* Invert the result? */
3560 _char_class_number classnum;