This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regex: Add pseudo-Posix class: 'cased'
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
7e0d5ad7
KW
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
991fc03a 42static const char* const non_utf8_target_but_utf8_required
7e0d5ad7
KW
43 = "Can't match, because target string needs to be in UTF-8\n";
44
6b54ddc5
YO
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
a687059c 50/*
e50aee73 51 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2 72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
a687059c 75 ****
9ef589d8
LW
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGEXEC_C
a687059c 85#include "perl.h"
0f5d15d6 86
54df2634
NC
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
a687059c 92
81e983c1 93#include "inline_invlist.c"
1b0f46bf 94#include "unicode_constants.h"
81e983c1 95
c74f6de9
KW
96#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
e0193e47 102/* Valid for non-utf8 strings: avoids the reginclass
7e2509c1
KW
103 * call if there are no complications: i.e., if everything matchable is
104 * straight forward in the bitmap */
635cd5d4 105#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
af364d03 106 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 107
c277df42
IZ
108/*
109 * Forwards.
110 */
111
f2ed9b32 112#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
53c4c00c 113#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
a0ed51b3 114
3dab1dad
YO
115#define HOPc(pos,off) \
116 (char *)(PL_reg_match_utf8 \
52657f30 117 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
3dab1dad
YO
118 : (U8*)(pos + off))
119#define HOPBACKc(pos, off) \
07be1b83
YO
120 (char*)(PL_reg_match_utf8\
121 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
122 : (pos - off >= PL_bostr) \
8e11feef 123 ? (U8*)pos - off \
3dab1dad 124 : NULL)
efb30f32 125
e7409c1b 126#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 127#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 128
7016d6eb
DM
129
130#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
131#define NEXTCHR_IS_EOS (nextchr < 0)
132
133#define SET_nextchr \
134 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
135
136#define SET_locinput(p) \
137 locinput = (p); \
138 SET_nextchr
139
140
c7304fe2
KW
141#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START { \
142 if (!swash_ptr) { \
143 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
144 ENTER; save_re_context(); \
145 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
146 1, 0, NULL, &flags); \
147 assert(swash_ptr); \
148 } \
149 } STMT_END
150
151/* If in debug mode, we test that a known character properly matches */
152#ifdef DEBUGGING
153# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
154 property_name, \
155 utf8_char_in_property) \
156 LOAD_UTF8_CHARCLASS(swash_ptr, property_name); \
157 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
158#else
159# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
160 property_name, \
161 utf8_char_in_property) \
162 LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
163#endif
d1eb3177 164
c7304fe2
KW
165#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
166 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
167 swash_property_names[_CC_WORDCHAR], \
168 GREEK_SMALL_LETTER_IOTA_UTF8)
169
170#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
171 STMT_START { \
172 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
173 "_X_regular_begin", \
174 GREEK_SMALL_LETTER_IOTA_UTF8); \
175 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
176 "_X_extend", \
177 COMBINING_GRAVE_ACCENT_UTF8); \
178 } STMT_END
d1eb3177 179
c7304fe2 180#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
181/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
182
5f80c4cf 183/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
184/* it would be nice to rework regcomp.sym to generate this stuff. sigh
185 *
186 * NOTE that *nothing* that affects backtracking should be in here, specifically
187 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
188 * node that is in between two EXACT like nodes when ascertaining what the required
189 * "follow" character is. This should probably be moved to regex compile time
190 * although it may be done at run time beause of the REF possibility - more
191 * investigation required. -- demerphq
192*/
3e901dc0
YO
193#define JUMPABLE(rn) ( \
194 OP(rn) == OPEN || \
195 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
196 OP(rn) == EVAL || \
cca55fe3
JP
197 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
198 OP(rn) == PLUS || OP(rn) == MINMOD || \
d1c771f5 199 OP(rn) == KEEPS || \
3dab1dad 200 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 201)
ee9b8eae 202#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 203
ee9b8eae
YO
204#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
205
206#if 0
207/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
208 we don't need this definition. */
209#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
fab2782b 210#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
211#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
212
213#else
214/* ... so we use this as its faster. */
215#define IS_TEXT(rn) ( OP(rn)==EXACT )
fab2782b 216#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
ee9b8eae
YO
217#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
218#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
219
220#endif
e2d8ce26 221
a84d97b6
HS
222/*
223 Search for mandatory following text node; for lookahead, the text must
224 follow but for lookbehind (rn->flags != 0) we skip to the next step.
225*/
cca55fe3 226#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
227 while (JUMPABLE(rn)) { \
228 const OPCODE type = OP(rn); \
229 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 230 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 231 else if (type == PLUS) \
cca55fe3 232 rn = NEXTOPER(rn); \
3dab1dad 233 else if (type == IFMATCH) \
a84d97b6 234 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 235 else rn += NEXT_OFF(rn); \
3dab1dad 236 } \
5f80c4cf 237} STMT_END
74750237 238
22913b96
KW
239/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
240 * These are for the pre-composed Hangul syllables, which are all in a
241 * contiguous block and arranged there in such a way so as to facilitate
242 * alorithmic determination of their characteristics. As such, they don't need
243 * a swash, but can be determined by simple arithmetic. Almost all are
244 * GCB=LVT, but every 28th one is a GCB=LV */
245#define SBASE 0xAC00 /* Start of block */
246#define SCount 11172 /* Length of block */
247#define TCount 28
c476f425 248
acfe0abc 249static void restore_pos(pTHX_ void *arg);
51371543 250
87c0511b 251#define REGCP_PAREN_ELEMS 3
f067efbf 252#define REGCP_OTHER_ELEMS 3
e0fa7e2b 253#define REGCP_FRAME_ELEMS 1
620d5b66
NC
254/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
255 * are needed for the regexp context stack bookkeeping. */
256
76e3520e 257STATIC CHECKPOINT
92da3157 258S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 259{
97aff369 260 dVAR;
a3b680e6 261 const int retval = PL_savestack_ix;
92da3157
DM
262 const int paren_elems_to_push =
263 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
264 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
265 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 266 I32 p;
40a82448 267 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 268
b93070ed
DM
269 PERL_ARGS_ASSERT_REGCPPUSH;
270
e49a9654 271 if (paren_elems_to_push < 0)
5637ef5b
NC
272 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
273 paren_elems_to_push);
e49a9654 274
e0fa7e2b
NC
275 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
276 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 277 " out of range (%lu-%ld)",
92da3157
DM
278 total_elems,
279 (unsigned long)maxopenparen,
280 (long)parenfloor);
e0fa7e2b 281
620d5b66 282 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 283
495f47a5 284 DEBUG_BUFFERS_r(
92da3157 285 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
286 PerlIO_printf(Perl_debug_log,
287 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
288 PTR2UV(rex),
289 PTR2UV(rex->offs)
290 );
291 );
92da3157 292 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 293/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
b93070ed
DM
294 SSPUSHINT(rex->offs[p].end);
295 SSPUSHINT(rex->offs[p].start);
1ca2007e 296 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 297 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
298 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
299 (UV)p,
300 (IV)rex->offs[p].start,
301 (IV)rex->offs[p].start_tmp,
302 (IV)rex->offs[p].end
40a82448 303 ));
a0d0e21e 304 }
b1ce53c5 305/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 306 SSPUSHINT(maxopenparen);
b93070ed
DM
307 SSPUSHINT(rex->lastparen);
308 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 309 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 310
a0d0e21e
LW
311 return retval;
312}
313
c277df42 314/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
315#define REGCP_SET(cp) \
316 DEBUG_STATE_r( \
ab3bbdeb 317 PerlIO_printf(Perl_debug_log, \
e4f74956 318 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
319 (IV)PL_savestack_ix)); \
320 cp = PL_savestack_ix
c3464db5 321
ab3bbdeb 322#define REGCP_UNWIND(cp) \
e4f74956 323 DEBUG_STATE_r( \
ab3bbdeb 324 if (cp != PL_savestack_ix) \
e4f74956
YO
325 PerlIO_printf(Perl_debug_log, \
326 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
327 (IV)(cp), (IV)PL_savestack_ix)); \
328 regcpblow(cp)
c277df42 329
a8d1f4b4
DM
330#define UNWIND_PAREN(lp, lcp) \
331 for (n = rex->lastparen; n > lp; n--) \
332 rex->offs[n].end = -1; \
333 rex->lastparen = n; \
334 rex->lastcloseparen = lcp;
335
336
f067efbf 337STATIC void
92da3157 338S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 339{
97aff369 340 dVAR;
e0fa7e2b 341 UV i;
87c0511b 342 U32 paren;
a3621e74
YO
343 GET_RE_DEBUG_FLAGS_DECL;
344
7918f24d
NC
345 PERL_ARGS_ASSERT_REGCPPOP;
346
b1ce53c5 347 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 348 i = SSPOPUV;
e0fa7e2b
NC
349 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
350 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
351 rex->lastcloseparen = SSPOPINT;
352 rex->lastparen = SSPOPINT;
92da3157 353 *maxopenparen_p = SSPOPINT;
b1ce53c5 354
620d5b66 355 i -= REGCP_OTHER_ELEMS;
b1ce53c5 356 /* Now restore the parentheses context. */
495f47a5
DM
357 DEBUG_BUFFERS_r(
358 if (i || rex->lastparen + 1 <= rex->nparens)
359 PerlIO_printf(Perl_debug_log,
360 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
361 PTR2UV(rex),
362 PTR2UV(rex->offs)
363 );
364 );
92da3157 365 paren = *maxopenparen_p;
620d5b66 366 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
1df70142 367 I32 tmps;
1ca2007e 368 rex->offs[paren].start_tmp = SSPOPINT;
b93070ed 369 rex->offs[paren].start = SSPOPINT;
cf93c79d 370 tmps = SSPOPINT;
b93070ed
DM
371 if (paren <= rex->lastparen)
372 rex->offs[paren].end = tmps;
495f47a5
DM
373 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
374 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
375 (UV)paren,
376 (IV)rex->offs[paren].start,
377 (IV)rex->offs[paren].start_tmp,
378 (IV)rex->offs[paren].end,
379 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 380 );
87c0511b 381 paren--;
a0d0e21e 382 }
daf18116 383#if 1
dafc8851
JH
384 /* It would seem that the similar code in regtry()
385 * already takes care of this, and in fact it is in
386 * a better location to since this code can #if 0-ed out
387 * but the code in regtry() is needed or otherwise tests
388 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
389 * (as of patchlevel 7877) will fail. Then again,
390 * this code seems to be necessary or otherwise
225593e1
DM
391 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
392 * --jhi updated by dapm */
b93070ed 393 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 394 if (i > *maxopenparen_p)
b93070ed
DM
395 rex->offs[i].start = -1;
396 rex->offs[i].end = -1;
495f47a5
DM
397 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
398 " \\%"UVuf": %s ..-1 undeffing\n",
399 (UV)i,
92da3157 400 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 401 ));
a0d0e21e 402 }
dafc8851 403#endif
a0d0e21e
LW
404}
405
74088413
DM
406/* restore the parens and associated vars at savestack position ix,
407 * but without popping the stack */
408
409STATIC void
92da3157 410S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
411{
412 I32 tmpix = PL_savestack_ix;
413 PL_savestack_ix = ix;
92da3157 414 regcppop(rex, maxopenparen_p);
74088413
DM
415 PL_savestack_ix = tmpix;
416}
417
02db2b7b 418#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 419
31c7f561
KW
420STATIC bool
421S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
422{
423 /* Returns a boolean as to whether or not 'character' is a member of the
424 * Posix character class given by 'classnum' that should be equivalent to a
425 * value in the typedef '_char_class_number'.
426 *
427 * Ideally this could be replaced by a just an array of function pointers
428 * to the C library functions that implement the macros this calls.
429 * However, to compile, the precise function signatures are required, and
430 * these may vary from platform to to platform. To avoid having to figure
431 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
432 * which adds an extra layer of function call overhead (unless the C
433 * optimizer strips it away). But we don't particularly care about
434 * performance with locales anyway. */
31c7f561
KW
435
436 switch ((_char_class_number) classnum) {
15861f94 437 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 438 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
439 case _CC_ENUM_ASCII: return isASCII_LC(character);
440 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
441 case _CC_ENUM_CASED: return isLOWER_LC(character)
442 || isUPPER_LC(character);
e8d596e0 443 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
444 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
445 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
446 case _CC_ENUM_LOWER: return isLOWER_LC(character);
447 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 448 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 449 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 450 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
451 case _CC_ENUM_UPPER: return isUPPER_LC(character);
452 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 453 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
454 default: /* VERTSPACE should never occur in locales */
455 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
456 }
457
458 assert(0); /* NOTREACHED */
459 return FALSE;
460}
461
3018b823
KW
462STATIC bool
463S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
464{
465 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
466 * 'character' is a member of the Posix character class given by 'classnum'
467 * that should be equivalent to a value in the typedef
468 * '_char_class_number'.
469 *
470 * This just calls isFOO_lc on the code point for the character if it is in
471 * the range 0-255. Outside that range, all characters avoid Unicode
472 * rules, ignoring any locale. So use the Unicode function if this class
473 * requires a swash, and use the Unicode macro otherwise. */
474
475 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
476
477 if (UTF8_IS_INVARIANT(*character)) {
478 return isFOO_lc(classnum, *character);
479 }
480 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
481 return isFOO_lc(classnum,
482 TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
483 }
484
485 if (classnum < _FIRST_NON_SWASH_CC) {
486
487 /* Initialize the swash unless done already */
488 if (! PL_utf8_swash_ptrs[classnum]) {
489 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
490 PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
491 swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
492 }
493
494 return swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *) character, TRUE);
495 }
496
497 switch ((_char_class_number) classnum) {
498 case _CC_ENUM_SPACE:
499 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
500
501 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
502 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
503 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
504 default: return 0; /* Things like CNTRL are always
505 below 256 */
506 }
507
508 assert(0); /* NOTREACHED */
509 return FALSE;
510}
511
a687059c 512/*
e50aee73 513 * pregexec and friends
a687059c
LW
514 */
515
76234dfb 516#ifndef PERL_IN_XSUB_RE
a687059c 517/*
c277df42 518 - pregexec - match a regexp against a string
a687059c 519 */
c277df42 520I32
5aaab254 521Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
c3464db5 522 char *strbeg, I32 minend, SV *screamer, U32 nosave)
8fd1a950
DM
523/* stringarg: the point in the string at which to begin matching */
524/* strend: pointer to null at end of string */
525/* strbeg: real beginning of string */
526/* minend: end of match must be >= minend bytes after stringarg. */
527/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
528 * itself is accessed via the pointers above */
529/* nosave: For optimizations. */
c277df42 530{
7918f24d
NC
531 PERL_ARGS_ASSERT_PREGEXEC;
532
c277df42 533 return
9041c2e3 534 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
535 nosave ? 0 : REXEC_COPY_STR);
536}
76234dfb 537#endif
22e551b9 538
9041c2e3 539/*
cad2e5aa
JH
540 * Need to implement the following flags for reg_anch:
541 *
542 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
543 * USE_INTUIT_ML
544 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
545 * INTUIT_AUTORITATIVE_ML
546 * INTUIT_ONCE_NOML - Intuit can match in one location only.
547 * INTUIT_ONCE_ML
548 *
549 * Another flag for this function: SECOND_TIME (so that float substrs
550 * with giant delta may be not rechecked).
551 */
552
553/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
554
3f7c398e 555/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
556 Otherwise, only SvCUR(sv) is used to get strbeg. */
557
558/* XXXX We assume that strpos is strbeg unless sv. */
559
6eb5f6b9
JH
560/* XXXX Some places assume that there is a fixed substring.
561 An update may be needed if optimizer marks as "INTUITable"
562 RExen without fixed substrings. Similarly, it is assumed that
563 lengths of all the strings are no more than minlen, thus they
564 cannot come from lookahead.
40d049e4
YO
565 (Or minlen should take into account lookahead.)
566 NOTE: Some of this comment is not correct. minlen does now take account
567 of lookahead/behind. Further research is required. -- demerphq
568
569*/
6eb5f6b9 570
2c2d71f5
JH
571/* A failure to find a constant substring means that there is no need to make
572 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 573 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
574 regtry() should be needed.
575
576 REx compiler's optimizer found 4 possible hints:
577 a) Anchored substring;
578 b) Fixed substring;
579 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 580 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 581 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
582 string which does not contradict any of them.
583 */
2c2d71f5 584
6eb5f6b9
JH
585/* Most of decisions we do here should have been done at compile time.
586 The nodes of the REx which we used for the search should have been
587 deleted from the finite automaton. */
588
cad2e5aa 589char *
288b8c02 590Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
9f61653a 591 char *strend, const U32 flags, re_scream_pos_data *data)
cad2e5aa 592{
97aff369 593 dVAR;
8d919b0a 594 struct regexp *const prog = ReANY(rx);
eb578fdb 595 I32 start_shift = 0;
cad2e5aa 596 /* Should be nonnegative! */
eb578fdb
KW
597 I32 end_shift = 0;
598 char *s;
599 SV *check;
a1933d95 600 char *strbeg;
cad2e5aa 601 char *t;
f2ed9b32 602 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
cad2e5aa 603 I32 ml_anch;
eb578fdb 604 char *other_last = NULL; /* other substr checked before this */
bd61b366 605 char *check_at = NULL; /* check substr found at this pos */
d8080198 606 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 607 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 608 RXi_GET_DECL(prog,progi);
984e6dd1 609 bool is_utf8_pat;
30944b6d 610#ifdef DEBUGGING
890ce7af 611 const char * const i_strpos = strpos;
30944b6d 612#endif
a3621e74
YO
613 GET_RE_DEBUG_FLAGS_DECL;
614
7918f24d 615 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
616 PERL_UNUSED_ARG(flags);
617 PERL_UNUSED_ARG(data);
7918f24d 618
f2ed9b32 619 RX_MATCH_UTF8_set(rx,utf8_target);
cad2e5aa 620
984e6dd1 621 is_utf8_pat = cBOOL(RX_UTF8(rx));
4fab19ce 622
ab3bbdeb 623 DEBUG_EXECUTE_r(
f2ed9b32 624 debug_start_match(rx, utf8_target, strpos, strend,
1de06328
YO
625 sv ? "Guessing start of match in sv for"
626 : "Guessing start of match in string for");
2a782b5b 627 );
cad2e5aa 628
c344f387
JH
629 /* CHR_DIST() would be more correct here but it makes things slow. */
630 if (prog->minlen > strend - strpos) {
a3621e74 631 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584 632 "String too short... [re_intuit_start]\n"));
cad2e5aa 633 goto fail;
2c2d71f5 634 }
d8da0584 635
7016d6eb
DM
636 /* XXX we need to pass strbeg as a separate arg: the following is
637 * guesswork and can be wrong... */
638 if (sv && SvPOK(sv)) {
639 char * p = SvPVX(sv);
640 STRLEN cur = SvCUR(sv);
641 if (p <= strpos && strpos < p + cur) {
642 strbeg = p;
643 assert(p <= strend && strend <= p + cur);
644 }
645 else
646 strbeg = strend - cur;
647 }
648 else
649 strbeg = strpos;
650
1aa99e6b 651 PL_regeol = strend;
f2ed9b32 652 if (utf8_target) {
33b8afdf
JH
653 if (!prog->check_utf8 && prog->check_substr)
654 to_utf8_substr(prog);
655 check = prog->check_utf8;
656 } else {
7e0d5ad7
KW
657 if (!prog->check_substr && prog->check_utf8) {
658 if (! to_byte_substr(prog)) {
6b54ddc5 659 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
660 }
661 }
33b8afdf
JH
662 check = prog->check_substr;
663 }
bbe252da
YO
664 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
665 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
666 || ( (prog->extflags & RXf_ANCH_BOL)
7fba1cd6 667 && !multiline ) ); /* Check after \n? */
cad2e5aa 668
7e25d62c 669 if (!ml_anch) {
bbe252da
YO
670 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
671 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
3f7c398e 672 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
7e25d62c
JH
673 && sv && !SvROK(sv)
674 && (strpos != strbeg)) {
a3621e74 675 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
7e25d62c
JH
676 goto fail;
677 }
d46b78c6
KW
678 if (prog->check_offset_min == prog->check_offset_max
679 && !(prog->extflags & RXf_CANY_SEEN)
680 && ! multiline) /* /m can cause \n's to match that aren't
681 accounted for in the string max length.
682 See [perl #115242] */
683 {
2c2d71f5 684 /* Substring at constant offset from beg-of-str... */
cad2e5aa
JH
685 I32 slen;
686
1aa99e6b 687 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 688
653099ff
GS
689 if (SvTAIL(check)) {
690 slen = SvCUR(check); /* >= 1 */
cad2e5aa 691
9041c2e3 692 if ( strend - s > slen || strend - s < slen - 1
2c2d71f5 693 || (strend - s == slen && strend[-1] != '\n')) {
a3621e74 694 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
2c2d71f5 695 goto fail_finish;
cad2e5aa
JH
696 }
697 /* Now should match s[0..slen-2] */
698 slen--;
3f7c398e 699 if (slen && (*SvPVX_const(check) != *s
cad2e5aa 700 || (slen > 1
3f7c398e 701 && memNE(SvPVX_const(check), s, slen)))) {
2c2d71f5 702 report_neq:
a3621e74 703 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
2c2d71f5
JH
704 goto fail_finish;
705 }
cad2e5aa 706 }
3f7c398e 707 else if (*SvPVX_const(check) != *s
653099ff 708 || ((slen = SvCUR(check)) > 1
3f7c398e 709 && memNE(SvPVX_const(check), s, slen)))
2c2d71f5 710 goto report_neq;
c315bfe8 711 check_at = s;
2c2d71f5 712 goto success_at_start;
7e25d62c 713 }
cad2e5aa 714 }
2c2d71f5 715 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
cad2e5aa 716 s = strpos;
2c2d71f5 717 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
1de06328
YO
718 end_shift = prog->check_end_shift;
719
2c2d71f5 720 if (!ml_anch) {
a3b680e6 721 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
653099ff 722 - (SvTAIL(check) != 0);
a3b680e6 723 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
2c2d71f5
JH
724
725 if (end_shift < eshift)
726 end_shift = eshift;
727 }
cad2e5aa 728 }
2c2d71f5 729 else { /* Can match at random position */
cad2e5aa
JH
730 ml_anch = 0;
731 s = strpos;
1de06328
YO
732 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
733 end_shift = prog->check_end_shift;
734
735 /* end shift should be non negative here */
cad2e5aa
JH
736 }
737
bcdf7404 738#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 739 if (end_shift < 0)
1de06328 740 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 741 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
742#endif
743
2c2d71f5
JH
744 restart:
745 /* Find a possible match in the region s..strend by looking for
746 the "check" substring in the region corrected by start/end_shift. */
1de06328
YO
747
748 {
749 I32 srch_start_shift = start_shift;
750 I32 srch_end_shift = end_shift;
c33e64f0
FC
751 U8* start_point;
752 U8* end_point;
1de06328
YO
753 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
754 srch_end_shift -= ((strbeg - s) - srch_start_shift);
755 srch_start_shift = strbeg - s;
756 }
6bda09f9 757 DEBUG_OPTIMISE_MORE_r({
1de06328
YO
758 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
759 (IV)prog->check_offset_min,
760 (IV)srch_start_shift,
761 (IV)srch_end_shift,
762 (IV)prog->check_end_shift);
763 });
764
bbe252da 765 if (prog->extflags & RXf_CANY_SEEN) {
1de06328
YO
766 start_point= (U8*)(s + srch_start_shift);
767 end_point= (U8*)(strend - srch_end_shift);
768 } else {
769 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
770 end_point= HOP3(strend, -srch_end_shift, strbeg);
771 }
6bda09f9 772 DEBUG_OPTIMISE_MORE_r({
56570a2c 773 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
1de06328 774 (int)(end_point - start_point),
fc8cd66c 775 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
776 start_point);
777 });
778
779 s = fbm_instr( start_point, end_point,
7fba1cd6 780 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 781 }
cad2e5aa
JH
782 /* Update the count-of-usability, remove useless subpatterns,
783 unshift s. */
2c2d71f5 784
ab3bbdeb 785 DEBUG_EXECUTE_r({
f2ed9b32 786 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
787 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
788 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
2c2d71f5 789 (s ? "Found" : "Did not find"),
f2ed9b32 790 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
791 ? "anchored" : "floating"),
792 quoted,
793 RE_SV_TAIL(check),
794 (s ? " at offset " : "...\n") );
795 });
2c2d71f5
JH
796
797 if (!s)
798 goto fail_finish;
2c2d71f5 799 /* Finish the diagnostic message */
a3621e74 800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
2c2d71f5 801
1de06328
YO
802 /* XXX dmq: first branch is for positive lookbehind...
803 Our check string is offset from the beginning of the pattern.
804 So we need to do any stclass tests offset forward from that
805 point. I think. :-(
806 */
807
808
809
810 check_at=s;
811
812
2c2d71f5
JH
813 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
814 Start with the other substr.
815 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 816 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
817 *always* match. Probably should be marked during compile...
818 Probably it is right to do no SCREAM here...
819 */
820
f2ed9b32 821 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
1de06328
YO
822 : (prog->float_substr && prog->anchored_substr))
823 {
30944b6d 824 /* Take into account the "other" substring. */
2c2d71f5
JH
825 /* XXXX May be hopelessly wrong for UTF... */
826 if (!other_last)
6eb5f6b9 827 other_last = strpos;
f2ed9b32 828 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
30944b6d
IZ
829 do_other_anchored:
830 {
890ce7af
AL
831 char * const last = HOP3c(s, -start_shift, strbeg);
832 char *last1, *last2;
be8e71aa 833 char * const saved_s = s;
33b8afdf 834 SV* must;
2c2d71f5 835
2c2d71f5
JH
836 t = s - prog->check_offset_max;
837 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 838 && (!utf8_target
0ce71af7 839 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
2c2d71f5 840 && t > strpos)))
6f207bd3 841 NOOP;
2c2d71f5
JH
842 else
843 t = strpos;
1aa99e6b 844 t = HOP3c(t, prog->anchored_offset, strend);
6eb5f6b9
JH
845 if (t < other_last) /* These positions already checked */
846 t = other_last;
1aa99e6b 847 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
2c2d71f5
JH
848 if (last < last1)
849 last1 = last;
1de06328
YO
850 /* XXXX It is not documented what units *_offsets are in.
851 We assume bytes, but this is clearly wrong.
852 Meaning this code needs to be carefully reviewed for errors.
853 dmq.
854 */
855
2c2d71f5 856 /* On end-of-str: see comment below. */
f2ed9b32 857 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
33b8afdf
JH
858 if (must == &PL_sv_undef) {
859 s = (char*)NULL;
1de06328 860 DEBUG_r(must = prog->anchored_utf8); /* for debug */
33b8afdf
JH
861 }
862 else
863 s = fbm_instr(
864 (unsigned char*)t,
865 HOP3(HOP3(last1, prog->anchored_offset, strend)
866 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
867 must,
7fba1cd6 868 multiline ? FBMrf_MULTILINE : 0
33b8afdf 869 );
ab3bbdeb 870 DEBUG_EXECUTE_r({
f2ed9b32 871 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
872 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
873 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
2c2d71f5 874 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
875 quoted, RE_SV_TAIL(must));
876 });
877
878
2c2d71f5
JH
879 if (!s) {
880 if (last1 >= last2) {
a3621e74 881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5
JH
882 ", giving up...\n"));
883 goto fail_finish;
884 }
a3621e74 885 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2c2d71f5 886 ", trying floating at offset %ld...\n",
be8e71aa 887 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
1aa99e6b
IH
888 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
889 s = HOP3c(last, 1, strend);
2c2d71f5
JH
890 goto restart;
891 }
892 else {
a3621e74 893 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
30944b6d 894 (long)(s - i_strpos)));
1aa99e6b
IH
895 t = HOP3c(s, -prog->anchored_offset, strbeg);
896 other_last = HOP3c(s, 1, strend);
be8e71aa 897 s = saved_s;
2c2d71f5
JH
898 if (t == strpos)
899 goto try_at_start;
2c2d71f5
JH
900 goto try_at_offset;
901 }
30944b6d 902 }
2c2d71f5
JH
903 }
904 else { /* Take into account the floating substring. */
33b8afdf 905 char *last, *last1;
be8e71aa 906 char * const saved_s = s;
33b8afdf
JH
907 SV* must;
908
909 t = HOP3c(s, -start_shift, strbeg);
910 last1 = last =
911 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
912 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
913 last = HOP3c(t, prog->float_max_offset, strend);
914 s = HOP3c(t, prog->float_min_offset, strend);
915 if (s < other_last)
916 s = other_last;
2c2d71f5 917 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
f2ed9b32 918 must = utf8_target ? prog->float_utf8 : prog->float_substr;
33b8afdf
JH
919 /* fbm_instr() takes into account exact value of end-of-str
920 if the check is SvTAIL(ed). Since false positives are OK,
921 and end-of-str is not later than strend we are OK. */
922 if (must == &PL_sv_undef) {
923 s = (char*)NULL;
1de06328 924 DEBUG_r(must = prog->float_utf8); /* for debug message */
33b8afdf
JH
925 }
926 else
2c2d71f5 927 s = fbm_instr((unsigned char*)s,
33b8afdf
JH
928 (unsigned char*)last + SvCUR(must)
929 - (SvTAIL(must)!=0),
7fba1cd6 930 must, multiline ? FBMrf_MULTILINE : 0);
ab3bbdeb 931 DEBUG_EXECUTE_r({
f2ed9b32 932 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
933 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
934 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
33b8afdf 935 (s ? "Found" : "Contradicts"),
ab3bbdeb
YO
936 quoted, RE_SV_TAIL(must));
937 });
33b8afdf
JH
938 if (!s) {
939 if (last1 == last) {
a3621e74 940 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf
JH
941 ", giving up...\n"));
942 goto fail_finish;
2c2d71f5 943 }
a3621e74 944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
33b8afdf 945 ", trying anchored starting at offset %ld...\n",
be8e71aa 946 (long)(saved_s + 1 - i_strpos)));
33b8afdf
JH
947 other_last = last;
948 s = HOP3c(t, 1, strend);
949 goto restart;
950 }
951 else {
a3621e74 952 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
33b8afdf
JH
953 (long)(s - i_strpos)));
954 other_last = s; /* Fix this later. --Hugo */
be8e71aa 955 s = saved_s;
33b8afdf
JH
956 if (t == strpos)
957 goto try_at_start;
958 goto try_at_offset;
959 }
2c2d71f5 960 }
cad2e5aa 961 }
2c2d71f5 962
1de06328 963
9ef43ace 964 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1de06328 965
6bda09f9 966 DEBUG_OPTIMISE_MORE_r(
1de06328
YO
967 PerlIO_printf(Perl_debug_log,
968 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
969 (IV)prog->check_offset_min,
970 (IV)prog->check_offset_max,
971 (IV)(s-strpos),
972 (IV)(t-strpos),
973 (IV)(t-s),
974 (IV)(strend-strpos)
975 )
976 );
977
2c2d71f5 978 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
f2ed9b32 979 && (!utf8_target
9ef43ace 980 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1de06328
YO
981 && t > strpos)))
982 {
2c2d71f5
JH
983 /* Fixed substring is found far enough so that the match
984 cannot start at strpos. */
985 try_at_offset:
cad2e5aa 986 if (ml_anch && t[-1] != '\n') {
30944b6d
IZ
987 /* Eventually fbm_*() should handle this, but often
988 anchored_offset is not 0, so this check will not be wasted. */
989 /* XXXX In the code below we prefer to look for "^" even in
990 presence of anchored substrings. And we search even
991 beyond the found float position. These pessimizations
992 are historical artefacts only. */
993 find_anchor:
2c2d71f5 994 while (t < strend - prog->minlen) {
cad2e5aa 995 if (*t == '\n') {
4ee3650e 996 if (t < check_at - prog->check_offset_min) {
f2ed9b32 997 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
4ee3650e
GS
998 /* Since we moved from the found position,
999 we definitely contradict the found anchored
30944b6d
IZ
1000 substr. Due to the above check we do not
1001 contradict "check" substr.
1002 Thus we can arrive here only if check substr
1003 is float. Redo checking for "other"=="fixed".
1004 */
9041c2e3 1005 strpos = t + 1;
a3621e74 1006 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
e4584336 1007 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
30944b6d
IZ
1008 goto do_other_anchored;
1009 }
4ee3650e
GS
1010 /* We don't contradict the found floating substring. */
1011 /* XXXX Why not check for STCLASS? */
cad2e5aa 1012 s = t + 1;
a3621e74 1013 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
e4584336 1014 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
cad2e5aa
JH
1015 goto set_useful;
1016 }
4ee3650e
GS
1017 /* Position contradicts check-string */
1018 /* XXXX probably better to look for check-string
1019 than for "\n", so one should lower the limit for t? */
a3621e74 1020 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
e4584336 1021 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
0e41cd87 1022 other_last = strpos = s = t + 1;
cad2e5aa
JH
1023 goto restart;
1024 }
1025 t++;
1026 }
a3621e74 1027 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
e4584336 1028 PL_colors[0], PL_colors[1]));
2c2d71f5 1029 goto fail_finish;
cad2e5aa 1030 }
f5952150 1031 else {
a3621e74 1032 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
e4584336 1033 PL_colors[0], PL_colors[1]));
f5952150 1034 }
cad2e5aa
JH
1035 s = t;
1036 set_useful:
f2ed9b32 1037 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1038 }
1039 else {
f5952150 1040 /* The found string does not prohibit matching at strpos,
2c2d71f5 1041 - no optimization of calling REx engine can be performed,
f5952150
GS
1042 unless it was an MBOL and we are not after MBOL,
1043 or a future STCLASS check will fail this. */
2c2d71f5
JH
1044 try_at_start:
1045 /* Even in this situation we may use MBOL flag if strpos is offset
1046 wrt the start of the string. */
05b4157f 1047 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
a1933d95 1048 && (strpos != strbeg) && strpos[-1] != '\n'
d506a20d 1049 /* May be due to an implicit anchor of m{.*foo} */
bbe252da 1050 && !(prog->intflags & PREGf_IMPLICIT))
d506a20d 1051 {
cad2e5aa
JH
1052 t = strpos;
1053 goto find_anchor;
1054 }
a3621e74 1055 DEBUG_EXECUTE_r( if (ml_anch)
f5952150 1056 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
70685ca0 1057 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
30944b6d 1058 );
2c2d71f5 1059 success_at_start:
bbe252da 1060 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1061 && (utf8_target ? (
33b8afdf
JH
1062 prog->check_utf8 /* Could be deleted already */
1063 && --BmUSEFUL(prog->check_utf8) < 0
1064 && (prog->check_utf8 == prog->float_utf8)
1065 ) : (
1066 prog->check_substr /* Could be deleted already */
1067 && --BmUSEFUL(prog->check_substr) < 0
1068 && (prog->check_substr == prog->float_substr)
1069 )))
66e933ab 1070 {
cad2e5aa 1071 /* If flags & SOMETHING - do not do it many times on the same match */
a3621e74 1072 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
f2ed9b32
KW
1073 /* XXX Does the destruction order has to change with utf8_target? */
1074 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1075 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1076 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1077 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1078 check = NULL; /* abort */
cad2e5aa 1079 s = strpos;
486ec47a 1080 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951
YO
1081 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1082 if (prog->intflags & PREGf_IMPLICIT)
1083 prog->extflags &= ~RXf_ANCH_MBOL;
3cf5c195
IZ
1084 /* XXXX This is a remnant of the old implementation. It
1085 looks wasteful, since now INTUIT can use many
6eb5f6b9 1086 other heuristics. */
bbe252da 1087 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1088 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa
JH
1089 }
1090 else
1091 s = strpos;
1092 }
1093
6eb5f6b9
JH
1094 /* Last resort... */
1095 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1096 /* trie stclasses are too expensive to use here, we are better off to
1097 leave it to regmatch itself */
f8fc2ecf 1098 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1099 /* minlen == 0 is possible if regstclass is \b or \B,
1100 and the fixed substr is ''$.
1101 Since minlen is already taken into account, s+1 is before strend;
1102 accidentally, minlen >= 1 guaranties no false positives at s + 1
1103 even for \b or \B. But (minlen? 1 : 0) below assumes that
1104 regstclass does not come from lookahead... */
1105 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1106 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf
YO
1107 const U8* const str = (U8*)STRING(progi->regstclass);
1108 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1109 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
66e933ab 1110 : 1);
1de06328
YO
1111 char * endpos;
1112 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1113 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1114 else if (prog->float_substr || prog->float_utf8)
1115 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1116 else
1117 endpos= strend;
1118
d8080198
YO
1119 if (checked_upto < s)
1120 checked_upto = s;
1121 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1122 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1123
6eb5f6b9 1124 t = s;
984e6dd1
DM
1125 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1126 NULL, is_utf8_pat);
d8080198
YO
1127 if (s) {
1128 checked_upto = s;
1129 } else {
6eb5f6b9 1130#ifdef DEBUGGING
cbbf8932 1131 const char *what = NULL;
6eb5f6b9
JH
1132#endif
1133 if (endpos == strend) {
a3621e74 1134 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6eb5f6b9
JH
1135 "Could not match STCLASS...\n") );
1136 goto fail;
1137 }
a3621e74 1138 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab 1139 "This position contradicts STCLASS...\n") );
bbe252da 1140 if ((prog->extflags & RXf_ANCH) && !ml_anch)
653099ff 1141 goto fail;
d8080198
YO
1142 checked_upto = HOPBACKc(endpos, start_shift);
1143 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1144 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1145 /* Contradict one of substrings */
33b8afdf 1146 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1147 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1148 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1149 hop_and_restart:
1aa99e6b 1150 s = HOP3c(t, 1, strend);
66e933ab
GS
1151 if (s + start_shift + end_shift > strend) {
1152 /* XXXX Should be taken into account earlier? */
a3621e74 1153 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
66e933ab
GS
1154 "Could not match STCLASS...\n") );
1155 goto fail;
1156 }
5e39e1e5
HS
1157 if (!check)
1158 goto giveup;
a3621e74 1159 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1160 "Looking for %s substr starting at offset %ld...\n",
6eb5f6b9
JH
1161 what, (long)(s + start_shift - i_strpos)) );
1162 goto restart;
1163 }
66e933ab 1164 /* Have both, check_string is floating */
6eb5f6b9
JH
1165 if (t + start_shift >= check_at) /* Contradicts floating=check */
1166 goto retry_floating_check;
1167 /* Recheck anchored substring, but not floating... */
9041c2e3 1168 s = check_at;
5e39e1e5
HS
1169 if (!check)
1170 goto giveup;
a3621e74 1171 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1172 "Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9
JH
1173 (long)(other_last - i_strpos)) );
1174 goto do_other_anchored;
1175 }
60e71179
GS
1176 /* Another way we could have checked stclass at the
1177 current position only: */
1178 if (ml_anch) {
1179 s = t = t + 1;
5e39e1e5
HS
1180 if (!check)
1181 goto giveup;
a3621e74 1182 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
f5952150 1183 "Looking for /%s^%s/m starting at offset %ld...\n",
e4584336 1184 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
60e71179 1185 goto try_at_offset;
66e933ab 1186 }
f2ed9b32 1187 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1188 goto fail;
486ec47a 1189 /* Check is floating substring. */
6eb5f6b9
JH
1190 retry_floating_check:
1191 t = check_at - start_shift;
a3621e74 1192 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1193 goto hop_and_restart;
1194 }
b7953727 1195 if (t != s) {
a3621e74 1196 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6eb5f6b9 1197 "By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1198 (long)(t - i_strpos), (long)(s - i_strpos))
1199 );
1200 }
1201 else {
a3621e74 1202 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
b7953727
JH
1203 "Does not contradict STCLASS...\n");
1204 );
1205 }
6eb5f6b9 1206 }
5e39e1e5 1207 giveup:
a3621e74 1208 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
5e39e1e5
HS
1209 PL_colors[4], (check ? "Guessed" : "Giving up"),
1210 PL_colors[5], (long)(s - i_strpos)) );
cad2e5aa 1211 return s;
2c2d71f5
JH
1212
1213 fail_finish: /* Substring not found */
33b8afdf 1214 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1215 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1216 fail:
a3621e74 1217 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1218 PL_colors[4], PL_colors[5]));
bd61b366 1219 return NULL;
cad2e5aa 1220}
9661b544 1221
a0a388a1
YO
1222#define DECL_TRIE_TYPE(scan) \
1223 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
fab2782b
YO
1224 trie_type = ((scan->flags == EXACT) \
1225 ? (utf8_target ? trie_utf8 : trie_plain) \
1226 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1227
1228#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1229uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1230 STRLEN skiplen; \
1231 switch (trie_type) { \
1232 case trie_utf8_fold: \
1233 if ( foldlen>0 ) { \
1234 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1235 foldlen -= len; \
1236 uscan += len; \
1237 len=0; \
1238 } else { \
1239 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1240 len = UTF8SKIP(uc); \
1241 skiplen = UNISKIP( uvc ); \
1242 foldlen -= skiplen; \
1243 uscan = foldbuf + skiplen; \
1244 } \
1245 break; \
1246 case trie_latin_utf8_fold: \
1247 if ( foldlen>0 ) { \
1248 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1249 foldlen -= len; \
1250 uscan += len; \
1251 len=0; \
1252 } else { \
1253 len = 1; \
1254 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1255 skiplen = UNISKIP( uvc ); \
1256 foldlen -= skiplen; \
1257 uscan = foldbuf + skiplen; \
1258 } \
1259 break; \
1260 case trie_utf8: \
1261 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1262 break; \
1263 case trie_plain: \
1264 uvc = (UV)*uc; \
1265 len = 1; \
1266 } \
1267 if (uvc < 256) { \
1268 charid = trie->charmap[ uvc ]; \
1269 } \
1270 else { \
1271 charid = 0; \
1272 if (widecharmap) { \
1273 SV** const svpp = hv_fetch(widecharmap, \
1274 (char*)&uvc, sizeof(UV), 0); \
1275 if (svpp) \
1276 charid = (U16)SvIV(*svpp); \
1277 } \
1278 } \
4cadc6a9
YO
1279} STMT_END
1280
4cadc6a9
YO
1281#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1282STMT_START { \
1283 while (s <= e) { \
1284 if ( (CoNd) \
fac1af77 1285 && (ln == 1 || folder(s, pat_string, ln)) \
9a5a5549 1286 && (!reginfo || regtry(reginfo, &s)) ) \
4cadc6a9
YO
1287 goto got_it; \
1288 s++; \
1289 } \
1290} STMT_END
1291
1292#define REXEC_FBC_UTF8_SCAN(CoDe) \
1293STMT_START { \
9a902117 1294 while (s < strend) { \
4cadc6a9 1295 CoDe \
9a902117 1296 s += UTF8SKIP(s); \
4cadc6a9
YO
1297 } \
1298} STMT_END
1299
1300#define REXEC_FBC_SCAN(CoDe) \
1301STMT_START { \
1302 while (s < strend) { \
1303 CoDe \
1304 s++; \
1305 } \
1306} STMT_END
1307
1308#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1309REXEC_FBC_UTF8_SCAN( \
1310 if (CoNd) { \
7aee35ff 1311 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1312 goto got_it; \
1313 else \
1314 tmp = doevery; \
1315 } \
1316 else \
1317 tmp = 1; \
1318)
1319
1320#define REXEC_FBC_CLASS_SCAN(CoNd) \
1321REXEC_FBC_SCAN( \
1322 if (CoNd) { \
24b23f37 1323 if (tmp && (!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1324 goto got_it; \
1325 else \
1326 tmp = doevery; \
1327 } \
1328 else \
1329 tmp = 1; \
1330)
1331
1332#define REXEC_FBC_TRYIT \
24b23f37 1333if ((!reginfo || regtry(reginfo, &s))) \
4cadc6a9
YO
1334 goto got_it
1335
e1d1eefb 1336#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
f2ed9b32 1337 if (utf8_target) { \
e1d1eefb
YO
1338 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1339 } \
1340 else { \
1341 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1342 }
e1d1eefb 1343
786e8c11
YO
1344#define DUMP_EXEC_POS(li,s,doutf8) \
1345 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1346
cfaf538b
KW
1347
1348#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1349 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1350 tmp = TEST_NON_UTF8(tmp); \
1351 REXEC_FBC_UTF8_SCAN( \
1352 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1353 tmp = !tmp; \
1354 IF_SUCCESS; \
1355 } \
1356 else { \
1357 IF_FAIL; \
1358 } \
1359 ); \
1360
1361#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1362 if (s == PL_bostr) { \
1363 tmp = '\n'; \
1364 } \
1365 else { \
1366 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1367 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1368 } \
1369 tmp = TeSt1_UtF8; \
1370 LOAD_UTF8_CHARCLASS_ALNUM(); \
1371 REXEC_FBC_UTF8_SCAN( \
1372 if (tmp == ! (TeSt2_UtF8)) { \
1373 tmp = !tmp; \
1374 IF_SUCCESS; \
1375 } \
1376 else { \
1377 IF_FAIL; \
1378 } \
1379 ); \
1380
63ac0dad
KW
1381/* The only difference between the BOUND and NBOUND cases is that
1382 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1383 * NBOUND. This is accomplished by passing it in either the if or else clause,
1384 * with the other one being empty */
1385#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1386 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1387
1388#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1389 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1390
1391#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1392 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1393
1394#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1395 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1396
63ac0dad
KW
1397
1398/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1399 * be passed in completely with the variable name being tested, which isn't
1400 * such a clean interface, but this is easier to read than it was before. We
1401 * are looking for the boundary (or non-boundary between a word and non-word
1402 * character. The utf8 and non-utf8 cases have the same logic, but the details
1403 * must be different. Find the "wordness" of the character just prior to this
1404 * one, and compare it with the wordness of this one. If they differ, we have
1405 * a boundary. At the beginning of the string, pretend that the previous
1406 * character was a new-line */
cfaf538b 1407#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1408 if (utf8_target) { \
cfaf538b 1409 UTF8_CODE \
63ac0dad
KW
1410 } \
1411 else { /* Not utf8 */ \
1412 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1413 tmp = TEST_NON_UTF8(tmp); \
1414 REXEC_FBC_SCAN( \
1415 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1416 tmp = !tmp; \
1417 IF_SUCCESS; \
1418 } \
1419 else { \
1420 IF_FAIL; \
1421 } \
1422 ); \
1423 } \
1424 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1425 goto got_it;
1426
786e8c11
YO
1427/* We know what class REx starts with. Try to find this position... */
1428/* if reginfo is NULL, its a dryrun */
1429/* annoyingly all the vars in this routine have different names from their counterparts
1430 in regmatch. /grrr */
1431
3c3eec57 1432STATIC char *
07be1b83 1433S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
984e6dd1 1434 const char *strend, regmatch_info *reginfo, bool is_utf8_pat)
a687059c 1435{
73104a1b
KW
1436 dVAR;
1437 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1438 char *pat_string; /* The pattern's exactish string */
1439 char *pat_end; /* ptr to end char of pat_string */
1440 re_fold_t folder; /* Function for computing non-utf8 folds */
1441 const U8 *fold_array; /* array for folding ords < 256 */
1442 STRLEN ln;
1443 STRLEN lnc;
73104a1b
KW
1444 U8 c1;
1445 U8 c2;
1446 char *e;
1447 I32 tmp = 1; /* Scratch variable? */
1448 const bool utf8_target = PL_reg_match_utf8;
1449 UV utf8_fold_flags = 0;
3018b823
KW
1450 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1451 with a result inverts that result, as 0^1 =
1452 1 and 1^1 = 0 */
1453 _char_class_number classnum;
1454
73104a1b 1455 RXi_GET_DECL(prog,progi);
2f7f8cb1 1456
73104a1b 1457 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1458
73104a1b
KW
1459 /* We know what class it must start with. */
1460 switch (OP(c)) {
1461 case ANYOF:
9aa1e39f 1462 case ANYOF_SYNTHETIC:
954a2af6 1463 case ANYOF_WARN_SUPER:
73104a1b
KW
1464 if (utf8_target) {
1465 REXEC_FBC_UTF8_CLASS_SCAN(
1466 reginclass(prog, c, (U8*)s, utf8_target));
1467 }
1468 else {
1469 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1470 }
1471 break;
1472 case CANY:
1473 REXEC_FBC_SCAN(
1474 if (tmp && (!reginfo || regtry(reginfo, &s)))
1475 goto got_it;
1476 else
1477 tmp = doevery;
1478 );
1479 break;
1480
1481 case EXACTFA:
984e6dd1 1482 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1483 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1484 goto do_exactf_utf8;
1485 }
1486 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1487 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1488 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1489
73104a1b
KW
1490 case EXACTF:
1491 if (utf8_target) {
16d951b7 1492
73104a1b
KW
1493 /* regcomp.c already folded this if pattern is in UTF-8 */
1494 utf8_fold_flags = 0;
1495 goto do_exactf_utf8;
1496 }
1497 fold_array = PL_fold;
1498 folder = foldEQ;
1499 goto do_exactf_non_utf8;
1500
1501 case EXACTFL:
984e6dd1 1502 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1503 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1504 goto do_exactf_utf8;
1505 }
1506 fold_array = PL_fold_locale;
1507 folder = foldEQ_locale;
1508 goto do_exactf_non_utf8;
3c760661 1509
73104a1b 1510 case EXACTFU_SS:
984e6dd1 1511 if (is_utf8_pat) {
73104a1b
KW
1512 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1513 }
1514 goto do_exactf_utf8;
16d951b7 1515
73104a1b
KW
1516 case EXACTFU_TRICKYFOLD:
1517 case EXACTFU:
984e6dd1
DM
1518 if (is_utf8_pat || utf8_target) {
1519 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1520 goto do_exactf_utf8;
1521 }
fac1af77 1522
73104a1b
KW
1523 /* Any 'ss' in the pattern should have been replaced by regcomp,
1524 * so we don't have to worry here about this single special case
1525 * in the Latin1 range */
1526 fold_array = PL_fold_latin1;
1527 folder = foldEQ_latin1;
1528
1529 /* FALL THROUGH */
1530
1531 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1532 are no glitches with fold-length differences
1533 between the target string and pattern */
1534
1535 /* The idea in the non-utf8 EXACTF* cases is to first find the
1536 * first character of the EXACTF* node and then, if necessary,
1537 * case-insensitively compare the full text of the node. c1 is the
1538 * first character. c2 is its fold. This logic will not work for
1539 * Unicode semantics and the german sharp ss, which hence should
1540 * not be compiled into a node that gets here. */
1541 pat_string = STRING(c);
1542 ln = STR_LEN(c); /* length to match in octets/bytes */
1543
1544 /* We know that we have to match at least 'ln' bytes (which is the
1545 * same as characters, since not utf8). If we have to match 3
1546 * characters, and there are only 2 availabe, we know without
1547 * trying that it will fail; so don't start a match past the
1548 * required minimum number from the far end */
1549 e = HOP3c(strend, -((I32)ln), s);
1550
1551 if (!reginfo && e < s) {
1552 e = s; /* Due to minlen logic of intuit() */
1553 }
fac1af77 1554
73104a1b
KW
1555 c1 = *pat_string;
1556 c2 = fold_array[c1];
1557 if (c1 == c2) { /* If char and fold are the same */
1558 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1559 }
1560 else {
1561 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1562 }
1563 break;
fac1af77 1564
73104a1b
KW
1565 do_exactf_utf8:
1566 {
1567 unsigned expansion;
1568
1569 /* If one of the operands is in utf8, we can't use the simpler folding
1570 * above, due to the fact that many different characters can have the
1571 * same fold, or portion of a fold, or different- length fold */
1572 pat_string = STRING(c);
1573 ln = STR_LEN(c); /* length to match in octets/bytes */
1574 pat_end = pat_string + ln;
984e6dd1 1575 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1576 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1577 : ln;
1578
1579 /* We have 'lnc' characters to match in the pattern, but because of
1580 * multi-character folding, each character in the target can match
1581 * up to 3 characters (Unicode guarantees it will never exceed
1582 * this) if it is utf8-encoded; and up to 2 if not (based on the
1583 * fact that the Latin 1 folds are already determined, and the
1584 * only multi-char fold in that range is the sharp-s folding to
1585 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1586 * string character. Adjust lnc accordingly, rounding up, so that
1587 * if we need to match at least 4+1/3 chars, that really is 5. */
1588 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1589 lnc = (lnc + expansion - 1) / expansion;
1590
1591 /* As in the non-UTF8 case, if we have to match 3 characters, and
1592 * only 2 are left, it's guaranteed to fail, so don't start a
1593 * match that would require us to go beyond the end of the string
1594 */
1595 e = HOP3c(strend, -((I32)lnc), s);
1596
1597 if (!reginfo && e < s) {
1598 e = s; /* Due to minlen logic of intuit() */
1599 }
0658cdde 1600
73104a1b
KW
1601 /* XXX Note that we could recalculate e to stop the loop earlier,
1602 * as the worst case expansion above will rarely be met, and as we
1603 * go along we would usually find that e moves further to the left.
1604 * This would happen only after we reached the point in the loop
1605 * where if there were no expansion we should fail. Unclear if
1606 * worth the expense */
1607
1608 while (s <= e) {
1609 char *my_strend= (char *)strend;
1610 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1611 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
73104a1b
KW
1612 && (!reginfo || regtry(reginfo, &s)) )
1613 {
1614 goto got_it;
1615 }
1616 s += (utf8_target) ? UTF8SKIP(s) : 1;
1617 }
1618 break;
1619 }
1620 case BOUNDL:
272d35c9 1621 RXp_MATCH_TAINTED_on(prog);
73104a1b
KW
1622 FBC_BOUND(isALNUM_LC,
1623 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1624 isALNUM_LC_utf8((U8*)s));
1625 break;
1626 case NBOUNDL:
272d35c9 1627 RXp_MATCH_TAINTED_on(prog);
73104a1b
KW
1628 FBC_NBOUND(isALNUM_LC,
1629 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1630 isALNUM_LC_utf8((U8*)s));
1631 break;
1632 case BOUND:
1633 FBC_BOUND(isWORDCHAR,
1634 isALNUM_uni(tmp),
03940dc2 1635 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1636 break;
1637 case BOUNDA:
1638 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1639 isWORDCHAR_A(tmp),
1640 isWORDCHAR_A((U8*)s));
1641 break;
1642 case NBOUND:
1643 FBC_NBOUND(isWORDCHAR,
1644 isALNUM_uni(tmp),
03940dc2 1645 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1646 break;
1647 case NBOUNDA:
1648 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1649 isWORDCHAR_A(tmp),
1650 isWORDCHAR_A((U8*)s));
1651 break;
1652 case BOUNDU:
1653 FBC_BOUND(isWORDCHAR_L1,
1654 isALNUM_uni(tmp),
03940dc2 1655 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1656 break;
1657 case NBOUNDU:
1658 FBC_NBOUND(isWORDCHAR_L1,
1659 isALNUM_uni(tmp),
03940dc2 1660 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1661 break;
73104a1b
KW
1662 case LNBREAK:
1663 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1664 is_LNBREAK_latin1_safe(s, strend)
1665 );
1666 break;
3018b823
KW
1667
1668 /* The argument to all the POSIX node types is the class number to pass to
1669 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1670
1671 case NPOSIXL:
1672 to_complement = 1;
1673 /* FALLTHROUGH */
1674
1675 case POSIXL:
272d35c9 1676 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
1677 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1678 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1679 break;
3018b823
KW
1680
1681 case NPOSIXD:
1682 to_complement = 1;
1683 /* FALLTHROUGH */
1684
1685 case POSIXD:
1686 if (utf8_target) {
1687 goto posix_utf8;
1688 }
1689 goto posixa;
1690
1691 case NPOSIXA:
1692 if (utf8_target) {
1693 /* The complement of something that matches only ASCII matches all
1694 * UTF-8 variant code points, plus everything in ASCII that isn't
1695 * in the class */
1696 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1697 || ! _generic_isCC_A(*s, FLAGS(c)));
1698 break;
1699 }
1700
1701 to_complement = 1;
1702 /* FALLTHROUGH */
1703
73104a1b 1704 case POSIXA:
3018b823 1705 posixa:
73104a1b 1706 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1707 * byte invariant character. */
1708 REXEC_FBC_CLASS_SCAN(
1709 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1710 break;
3018b823
KW
1711
1712 case NPOSIXU:
1713 to_complement = 1;
1714 /* FALLTHROUGH */
1715
1716 case POSIXU:
1717 if (! utf8_target) {
1718 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1719 FLAGS(c))));
1720 }
1721 else {
1722
1723 posix_utf8:
1724 classnum = (_char_class_number) FLAGS(c);
1725 if (classnum < _FIRST_NON_SWASH_CC) {
1726 while (s < strend) {
1727
1728 /* We avoid loading in the swash as long as possible, but
1729 * should we have to, we jump to a separate loop. This
1730 * extra 'if' statement is what keeps this code from being
1731 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1732 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1733 goto found_above_latin1;
1734 }
1735 if ((UTF8_IS_INVARIANT(*s)
1736 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1737 classnum)))
1738 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1739 && to_complement ^ cBOOL(
1740 _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1741 classnum))))
1742 {
1743 if (tmp && (!reginfo || regtry(reginfo, &s)))
1744 goto got_it;
1745 else {
1746 tmp = doevery;
1747 }
1748 }
1749 else {
1750 tmp = 1;
1751 }
1752 s += UTF8SKIP(s);
1753 }
1754 }
1755 else switch (classnum) { /* These classes are implemented as
1756 macros */
1757 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1758 revert the change of \v matching this */
1759 /* FALL THROUGH */
1760
1761 case _CC_ENUM_PSXSPC:
1762 REXEC_FBC_UTF8_CLASS_SCAN(
1763 to_complement ^ cBOOL(isSPACE_utf8(s)));
1764 break;
1765
1766 case _CC_ENUM_BLANK:
1767 REXEC_FBC_UTF8_CLASS_SCAN(
1768 to_complement ^ cBOOL(isBLANK_utf8(s)));
1769 break;
1770
1771 case _CC_ENUM_XDIGIT:
1772 REXEC_FBC_UTF8_CLASS_SCAN(
1773 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1774 break;
1775
1776 case _CC_ENUM_VERTSPACE:
1777 REXEC_FBC_UTF8_CLASS_SCAN(
1778 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1779 break;
1780
1781 case _CC_ENUM_CNTRL:
1782 REXEC_FBC_UTF8_CLASS_SCAN(
1783 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1784 break;
1785
1786 default:
1787 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1788 assert(0); /* NOTREACHED */
1789 }
1790 }
1791 break;
1792
1793 found_above_latin1: /* Here we have to load a swash to get the result
1794 for the current code point */
1795 if (! PL_utf8_swash_ptrs[classnum]) {
1796 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1797 PL_utf8_swash_ptrs[classnum] =
1798 _core_swash_init("utf8", swash_property_names[classnum],
1799 &PL_sv_undef, 1, 0, NULL, &flags);
1800 }
1801
1802 /* This is a copy of the loop above for swash classes, though using the
1803 * FBC macro instead of being expanded out. Since we've loaded the
1804 * swash, we don't have to check for that each time through the loop */
1805 REXEC_FBC_UTF8_CLASS_SCAN(
1806 to_complement ^ cBOOL(_generic_utf8(
1807 classnum,
1808 s,
1809 swash_fetch(PL_utf8_swash_ptrs[classnum],
1810 (U8 *) s, TRUE))));
73104a1b
KW
1811 break;
1812
1813 case AHOCORASICKC:
1814 case AHOCORASICK:
1815 {
1816 DECL_TRIE_TYPE(c);
1817 /* what trie are we using right now */
1818 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1819 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1820 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1821
1822 const char *last_start = strend - trie->minlen;
6148ee25 1823#ifdef DEBUGGING
73104a1b 1824 const char *real_start = s;
6148ee25 1825#endif
73104a1b
KW
1826 STRLEN maxlen = trie->maxlen;
1827 SV *sv_points;
1828 U8 **points; /* map of where we were in the input string
1829 when reading a given char. For ASCII this
1830 is unnecessary overhead as the relationship
1831 is always 1:1, but for Unicode, especially
1832 case folded Unicode this is not true. */
1833 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1834 U8 *bitmap=NULL;
1835
1836
1837 GET_RE_DEBUG_FLAGS_DECL;
1838
1839 /* We can't just allocate points here. We need to wrap it in
1840 * an SV so it gets freed properly if there is a croak while
1841 * running the match */
1842 ENTER;
1843 SAVETMPS;
1844 sv_points=newSV(maxlen * sizeof(U8 *));
1845 SvCUR_set(sv_points,
1846 maxlen * sizeof(U8 *));
1847 SvPOK_on(sv_points);
1848 sv_2mortal(sv_points);
1849 points=(U8**)SvPV_nolen(sv_points );
1850 if ( trie_type != trie_utf8_fold
1851 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1852 {
1853 if (trie->bitmap)
1854 bitmap=(U8*)trie->bitmap;
1855 else
1856 bitmap=(U8*)ANYOF_BITMAP(c);
1857 }
1858 /* this is the Aho-Corasick algorithm modified a touch
1859 to include special handling for long "unknown char" sequences.
1860 The basic idea being that we use AC as long as we are dealing
1861 with a possible matching char, when we encounter an unknown char
1862 (and we have not encountered an accepting state) we scan forward
1863 until we find a legal starting char.
1864 AC matching is basically that of trie matching, except that when
1865 we encounter a failing transition, we fall back to the current
1866 states "fail state", and try the current char again, a process
1867 we repeat until we reach the root state, state 1, or a legal
1868 transition. If we fail on the root state then we can either
1869 terminate if we have reached an accepting state previously, or
1870 restart the entire process from the beginning if we have not.
1871
1872 */
1873 while (s <= last_start) {
1874 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1875 U8 *uc = (U8*)s;
1876 U16 charid = 0;
1877 U32 base = 1;
1878 U32 state = 1;
1879 UV uvc = 0;
1880 STRLEN len = 0;
1881 STRLEN foldlen = 0;
1882 U8 *uscan = (U8*)NULL;
1883 U8 *leftmost = NULL;
1884#ifdef DEBUGGING
1885 U32 accepted_word= 0;
786e8c11 1886#endif
73104a1b
KW
1887 U32 pointpos = 0;
1888
1889 while ( state && uc <= (U8*)strend ) {
1890 int failed=0;
1891 U32 word = aho->states[ state ].wordnum;
1892
1893 if( state==1 ) {
1894 if ( bitmap ) {
1895 DEBUG_TRIE_EXECUTE_r(
1896 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1897 dump_exec_pos( (char *)uc, c, strend, real_start,
1898 (char *)uc, utf8_target );
1899 PerlIO_printf( Perl_debug_log,
1900 " Scanning for legal start char...\n");
1901 }
1902 );
1903 if (utf8_target) {
1904 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1905 uc += UTF8SKIP(uc);
1906 }
1907 } else {
1908 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1909 uc++;
1910 }
786e8c11 1911 }
73104a1b 1912 s= (char *)uc;
07be1b83 1913 }
73104a1b
KW
1914 if (uc >(U8*)last_start) break;
1915 }
1916
1917 if ( word ) {
1918 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1919 if (!leftmost || lpos < leftmost) {
1920 DEBUG_r(accepted_word=word);
1921 leftmost= lpos;
7016d6eb 1922 }
73104a1b 1923 if (base==0) break;
7016d6eb 1924
73104a1b
KW
1925 }
1926 points[pointpos++ % maxlen]= uc;
1927 if (foldlen || uc < (U8*)strend) {
1928 REXEC_TRIE_READ_CHAR(trie_type, trie,
1929 widecharmap, uc,
1930 uscan, len, uvc, charid, foldlen,
1931 foldbuf, uniflags);
1932 DEBUG_TRIE_EXECUTE_r({
1933 dump_exec_pos( (char *)uc, c, strend,
1934 real_start, s, utf8_target);
1935 PerlIO_printf(Perl_debug_log,
1936 " Charid:%3u CP:%4"UVxf" ",
1937 charid, uvc);
1938 });
1939 }
1940 else {
1941 len = 0;
1942 charid = 0;
1943 }
07be1b83 1944
73104a1b
KW
1945
1946 do {
6148ee25 1947#ifdef DEBUGGING
73104a1b 1948 word = aho->states[ state ].wordnum;
6148ee25 1949#endif
73104a1b
KW
1950 base = aho->states[ state ].trans.base;
1951
1952 DEBUG_TRIE_EXECUTE_r({
1953 if (failed)
1954 dump_exec_pos( (char *)uc, c, strend, real_start,
1955 s, utf8_target );
1956 PerlIO_printf( Perl_debug_log,
1957 "%sState: %4"UVxf", word=%"UVxf,
1958 failed ? " Fail transition to " : "",
1959 (UV)state, (UV)word);
1960 });
1961 if ( base ) {
1962 U32 tmp;
1963 I32 offset;
1964 if (charid &&
1965 ( ((offset = base + charid
1966 - 1 - trie->uniquecharcount)) >= 0)
1967 && ((U32)offset < trie->lasttrans)
1968 && trie->trans[offset].check == state
1969 && (tmp=trie->trans[offset].next))
1970 {
1971 DEBUG_TRIE_EXECUTE_r(
1972 PerlIO_printf( Perl_debug_log," - legal\n"));
1973 state = tmp;
1974 break;
07be1b83
YO
1975 }
1976 else {
786e8c11 1977 DEBUG_TRIE_EXECUTE_r(
73104a1b 1978 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 1979 failed = 1;
73104a1b 1980 state = aho->fail[state];
07be1b83 1981 }
07be1b83 1982 }
73104a1b
KW
1983 else {
1984 /* we must be accepting here */
1985 DEBUG_TRIE_EXECUTE_r(
1986 PerlIO_printf( Perl_debug_log," - accepting\n"));
1987 failed = 1;
1988 break;
786e8c11 1989 }
73104a1b
KW
1990 } while(state);
1991 uc += len;
1992 if (failed) {
1993 if (leftmost)
1994 break;
1995 if (!state) state = 1;
07be1b83 1996 }
73104a1b
KW
1997 }
1998 if ( aho->states[ state ].wordnum ) {
1999 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2000 if (!leftmost || lpos < leftmost) {
2001 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2002 leftmost = lpos;
07be1b83
YO
2003 }
2004 }
73104a1b
KW
2005 if (leftmost) {
2006 s = (char*)leftmost;
2007 DEBUG_TRIE_EXECUTE_r({
2008 PerlIO_printf(
2009 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2010 (UV)accepted_word, (IV)(s - real_start)
2011 );
2012 });
2013 if (!reginfo || regtry(reginfo, &s)) {
2014 FREETMPS;
2015 LEAVE;
2016 goto got_it;
2017 }
2018 s = HOPc(s,1);
2019 DEBUG_TRIE_EXECUTE_r({
2020 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2021 });
2022 } else {
2023 DEBUG_TRIE_EXECUTE_r(
2024 PerlIO_printf( Perl_debug_log,"No match.\n"));
2025 break;
2026 }
2027 }
2028 FREETMPS;
2029 LEAVE;
2030 }
2031 break;
2032 default:
2033 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2034 break;
2035 }
2036 return 0;
2037 got_it:
2038 return s;
6eb5f6b9
JH
2039}
2040
fae667d5 2041
6eb5f6b9
JH
2042/*
2043 - regexec_flags - match a regexp against a string
2044 */
2045I32
5aaab254 2046Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
6eb5f6b9 2047 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2048/* stringarg: the point in the string at which to begin matching */
2049/* strend: pointer to null at end of string */
2050/* strbeg: real beginning of string */
2051/* minend: end of match must be >= minend bytes after stringarg. */
2052/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2053 * itself is accessed via the pointers above */
2054/* data: May be used for some additional optimizations.
2055 Currently its only used, with a U32 cast, for transmitting
2056 the ganch offset when doing a /g match. This will change */
2057/* nosave: For optimizations. */
2058
6eb5f6b9 2059{
97aff369 2060 dVAR;
8d919b0a 2061 struct regexp *const prog = ReANY(rx);
5aaab254 2062 char *s;
eb578fdb 2063 regnode *c;
5aaab254 2064 char *startpos = stringarg;
6eb5f6b9
JH
2065 I32 minlen; /* must match at least this many chars */
2066 I32 dontbother = 0; /* how many characters not to try at end */
6eb5f6b9
JH
2067 I32 end_shift = 0; /* Same for the end. */ /* CC */
2068 I32 scream_pos = -1; /* Internal iterator of scream. */
ccac19ea 2069 char *scream_olds = NULL;
f2ed9b32 2070 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2071 I32 multiline;
f8fc2ecf 2072 RXi_GET_DECL(prog,progi);
3b0527fe 2073 regmatch_info reginfo; /* create some info to pass to regtry etc */
e9105d30 2074 regexp_paren_pair *swap = NULL;
a3621e74
YO
2075 GET_RE_DEBUG_FLAGS_DECL;
2076
7918f24d 2077 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2078 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2079
2080 /* Be paranoid... */
2081 if (prog == NULL || startpos == NULL) {
2082 Perl_croak(aTHX_ "NULL regexp parameter");
2083 return 0;
2084 }
2085
bbe252da 2086 multiline = prog->extflags & RXf_PMf_MULTILINE;
288b8c02 2087 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2757e526 2088
f2ed9b32 2089 RX_MATCH_UTF8_set(rx, utf8_target);
1de06328 2090 DEBUG_EXECUTE_r(
f2ed9b32 2091 debug_start_match(rx, utf8_target, startpos, strend,
1de06328
YO
2092 "Matching");
2093 );
bac06658 2094
6eb5f6b9 2095 minlen = prog->minlen;
1de06328
YO
2096
2097 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2098 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2099 "String too short [regexec_flags]...\n"));
2100 goto phooey;
1aa99e6b 2101 }
6eb5f6b9 2102
1de06328 2103
6eb5f6b9 2104 /* Check validity of program. */
f8fc2ecf 2105 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2106 Perl_croak(aTHX_ "corrupted regexp program");
2107 }
2108
272d35c9 2109 RX_MATCH_TAINTED_off(rx);
ed301438 2110 PL_reg_state.re_state_eval_setup_done = FALSE;
6eb5f6b9
JH
2111 PL_reg_maxiter = 0;
2112
984e6dd1 2113 reginfo.is_utf8_pat = cBOOL(RX_UTF8(rx));
39819bd9 2114 reginfo.warned = FALSE;
6eb5f6b9 2115 /* Mark beginning of line for ^ and lookbehind. */
3b0527fe 2116 reginfo.bol = startpos; /* XXX not used ??? */
6eb5f6b9 2117 PL_bostr = strbeg;
3b0527fe 2118 reginfo.sv = sv;
6eb5f6b9
JH
2119
2120 /* Mark end of line for $ (and such) */
2121 PL_regeol = strend;
2122
2123 /* see how far we have to get to not match where we matched before */
3b0527fe 2124 reginfo.till = startpos+minend;
6eb5f6b9 2125
6eb5f6b9
JH
2126 /* If there is a "must appear" string, look for it. */
2127 s = startpos;
2128
bbe252da 2129 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
6eb5f6b9 2130 MAGIC *mg;
2c296965 2131 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
58e23c8d 2132 reginfo.ganch = startpos + prog->gofs;
2c296965 2133 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2134 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2c296965 2135 } else if (sv && SvTYPE(sv) >= SVt_PVMG
6eb5f6b9 2136 && SvMAGIC(sv)
14befaf4
DM
2137 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2138 && mg->mg_len >= 0) {
3b0527fe 2139 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2c296965 2140 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2141 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2c296965 2142
bbe252da 2143 if (prog->extflags & RXf_ANCH_GPOS) {
3b0527fe 2144 if (s > reginfo.ganch)
6eb5f6b9 2145 goto phooey;
58e23c8d 2146 s = reginfo.ganch - prog->gofs;
2c296965 2147 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
ed549f2e 2148 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
c584a96e
YO
2149 if (s < strbeg)
2150 goto phooey;
6eb5f6b9
JH
2151 }
2152 }
58e23c8d 2153 else if (data) {
70685ca0 2154 reginfo.ganch = strbeg + PTR2UV(data);
2c296965
YO
2155 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2156 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2157
2158 } else { /* pos() not defined */
3b0527fe 2159 reginfo.ganch = strbeg;
2c296965
YO
2160 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2161 "GPOS: reginfo.ganch = strbeg\n"));
2162 }
6eb5f6b9 2163 }
288b8c02 2164 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2165 /* We have to be careful. If the previous successful match
2166 was from this regex we don't want a subsequent partially
2167 successful match to clobber the old results.
2168 So when we detect this possibility we add a swap buffer
d8da0584
KW
2169 to the re, and switch the buffer each match. If we fail,
2170 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2171 */
2172 swap = prog->offs;
2173 /* do we need a save destructor here for eval dies? */
2174 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2175 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2176 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2177 PTR2UV(prog),
2178 PTR2UV(swap),
2179 PTR2UV(prog->offs)
2180 ));
c74340f9 2181 }
a0714e2c 2182 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
6eb5f6b9
JH
2183 re_scream_pos_data d;
2184
2185 d.scream_olds = &scream_olds;
2186 d.scream_pos = &scream_pos;
288b8c02 2187 s = re_intuit_start(rx, sv, s, strend, flags, &d);
3fa9c3d7 2188 if (!s) {
a3621e74 2189 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
6eb5f6b9 2190 goto phooey; /* not present */
3fa9c3d7 2191 }
6eb5f6b9
JH
2192 }
2193
1de06328 2194
6eb5f6b9
JH
2195
2196 /* Simplest case: anchored match need be tried only once. */
2197 /* [unless only anchor is BOL and multiline is set] */
bbe252da 2198 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
24b23f37 2199 if (s == startpos && regtry(&reginfo, &startpos))
6eb5f6b9 2200 goto got_it;
bbe252da
YO
2201 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2202 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
6eb5f6b9
JH
2203 {
2204 char *end;
2205
2206 if (minlen)
2207 dontbother = minlen - 1;
1aa99e6b 2208 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2209 /* for multiline we only have to try after newlines */
33b8afdf 2210 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2211 /* because of the goto we can not easily reuse the macros for bifurcating the
2212 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2213 if (utf8_target) {
2214 if (s == startpos)
2215 goto after_try_utf8;
2216 while (1) {
2217 if (regtry(&reginfo, &s)) {
2218 goto got_it;
2219 }
2220 after_try_utf8:
2221 if (s > end) {
2222 goto phooey;
2223 }
2224 if (prog->extflags & RXf_USE_INTUIT) {
2225 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2226 if (!s) {
2227 goto phooey;
2228 }
2229 }
2230 else {
2231 s += UTF8SKIP(s);
2232 }
2233 }
2234 } /* end search for check string in unicode */
2235 else {
2236 if (s == startpos) {
2237 goto after_try_latin;
2238 }
2239 while (1) {
2240 if (regtry(&reginfo, &s)) {
2241 goto got_it;
2242 }
2243 after_try_latin:
2244 if (s > end) {
2245 goto phooey;
2246 }
2247 if (prog->extflags & RXf_USE_INTUIT) {
2248 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2249 if (!s) {
2250 goto phooey;
2251 }
2252 }
2253 else {
2254 s++;
2255 }
2256 }
2257 } /* end search for check string in latin*/
2258 } /* end search for check string */
2259 else { /* search for newline */
2260 if (s > startpos) {
2261 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2262 s--;
92f3d482 2263 }
21eede78
YO
2264 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2265 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2266 if (*s++ == '\n') { /* don't need PL_utf8skip here */
24b23f37 2267 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2268 goto got_it;
2269 }
92f3d482
YO
2270 }
2271 } /* end search for newline */
2272 } /* end anchored/multiline check string search */
6eb5f6b9 2273 goto phooey;
bbe252da 2274 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
f9f4320a 2275 {
486ec47a 2276 /* the warning about reginfo.ganch being used without initialization
bbe252da 2277 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
f9f4320a 2278 and we only enter this block when the same bit is set. */
58e23c8d 2279 char *tmp_s = reginfo.ganch - prog->gofs;
c584a96e
YO
2280
2281 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
6eb5f6b9
JH
2282 goto got_it;
2283 goto phooey;
2284 }
2285
2286 /* Messy cases: unanchored match. */
bbe252da 2287 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2288 /* we have /x+whatever/ */
984e6dd1 2289 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2290 char ch;
bf93d4cc
GS
2291#ifdef DEBUGGING
2292 int did_match = 0;
2293#endif
f2ed9b32 2294 if (utf8_target) {
7e0d5ad7
KW
2295 if (! prog->anchored_utf8) {
2296 to_utf8_substr(prog);
2297 }
2298 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2299 REXEC_FBC_SCAN(
6eb5f6b9 2300 if (*s == ch) {
a3621e74 2301 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2302 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2303 s += UTF8SKIP(s);
2304 while (s < strend && *s == ch)
2305 s += UTF8SKIP(s);
2306 }
4cadc6a9 2307 );
7e0d5ad7 2308
6eb5f6b9
JH
2309 }
2310 else {
7e0d5ad7
KW
2311 if (! prog->anchored_substr) {
2312 if (! to_byte_substr(prog)) {
6b54ddc5 2313 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2314 }
2315 }
2316 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2317 REXEC_FBC_SCAN(
6eb5f6b9 2318 if (*s == ch) {
a3621e74 2319 DEBUG_EXECUTE_r( did_match = 1 );
24b23f37 2320 if (regtry(&reginfo, &s)) goto got_it;
6eb5f6b9
JH
2321 s++;
2322 while (s < strend && *s == ch)
2323 s++;
2324 }
4cadc6a9 2325 );
6eb5f6b9 2326 }
a3621e74 2327 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2328 PerlIO_printf(Perl_debug_log,
b7953727
JH
2329 "Did not find anchored character...\n")
2330 );
6eb5f6b9 2331 }
a0714e2c
SS
2332 else if (prog->anchored_substr != NULL
2333 || prog->anchored_utf8 != NULL
2334 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2335 && prog->float_max_offset < strend - s)) {
2336 SV *must;
2337 I32 back_max;
2338 I32 back_min;
2339 char *last;
6eb5f6b9 2340 char *last1; /* Last position checked before */
bf93d4cc
GS
2341#ifdef DEBUGGING
2342 int did_match = 0;
2343#endif
33b8afdf 2344 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2345 if (utf8_target) {
2346 if (! prog->anchored_utf8) {
2347 to_utf8_substr(prog);
2348 }
2349 must = prog->anchored_utf8;
2350 }
2351 else {
2352 if (! prog->anchored_substr) {
2353 if (! to_byte_substr(prog)) {
6b54ddc5 2354 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2355 }
2356 }
2357 must = prog->anchored_substr;
2358 }
33b8afdf
JH
2359 back_max = back_min = prog->anchored_offset;
2360 } else {
7e0d5ad7
KW
2361 if (utf8_target) {
2362 if (! prog->float_utf8) {
2363 to_utf8_substr(prog);
2364 }
2365 must = prog->float_utf8;
2366 }
2367 else {
2368 if (! prog->float_substr) {
2369 if (! to_byte_substr(prog)) {
6b54ddc5 2370 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2371 }
2372 }
2373 must = prog->float_substr;
2374 }
33b8afdf
JH
2375 back_max = prog->float_max_offset;
2376 back_min = prog->float_min_offset;
2377 }
1de06328 2378
1de06328
YO
2379 if (back_min<0) {
2380 last = strend;
2381 } else {
2382 last = HOP3c(strend, /* Cannot start after this */
2383 -(I32)(CHR_SVLEN(must)
2384 - (SvTAIL(must) != 0) + back_min), strbeg);
2385 }
6eb5f6b9
JH
2386 if (s > PL_bostr)
2387 last1 = HOPc(s, -1);
2388 else
2389 last1 = s - 1; /* bogus */
2390
a0288114 2391 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9
JH
2392 check_substr==must. */
2393 scream_pos = -1;
2394 dontbother = end_shift;
2395 strend = HOPc(strend, -dontbother);
2396 while ( (s <= last) &&
c33e64f0 2397 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
9041c2e3 2398 (unsigned char*)strend, must,
c33e64f0 2399 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2400 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2401 if (HOPc(s, -back_max) > last1) {
2402 last1 = HOPc(s, -back_min);
2403 s = HOPc(s, -back_max);
2404 }
2405 else {
52657f30 2406 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2407
2408 last1 = HOPc(s, -back_min);
52657f30 2409 s = t;
6eb5f6b9 2410 }
f2ed9b32 2411 if (utf8_target) {
6eb5f6b9 2412 while (s <= last1) {
24b23f37 2413 if (regtry(&reginfo, &s))
6eb5f6b9 2414 goto got_it;
7016d6eb
DM
2415 if (s >= last1) {
2416 s++; /* to break out of outer loop */
2417 break;
2418 }
2419 s += UTF8SKIP(s);
6eb5f6b9
JH
2420 }
2421 }
2422 else {
2423 while (s <= last1) {
24b23f37 2424 if (regtry(&reginfo, &s))
6eb5f6b9
JH
2425 goto got_it;
2426 s++;
2427 }
2428 }
2429 }
ab3bbdeb 2430 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2431 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2432 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2433 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2434 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2435 ? "anchored" : "floating"),
ab3bbdeb
YO
2436 quoted, RE_SV_TAIL(must));
2437 });
6eb5f6b9
JH
2438 goto phooey;
2439 }
f8fc2ecf 2440 else if ( (c = progi->regstclass) ) {
f14c76ed 2441 if (minlen) {
f8fc2ecf 2442 const OPCODE op = OP(progi->regstclass);
66e933ab 2443 /* don't bother with what can't match */
786e8c11 2444 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2445 strend = HOPc(strend, -(minlen - 1));
2446 }
a3621e74 2447 DEBUG_EXECUTE_r({
be8e71aa 2448 SV * const prop = sv_newmortal();
32fc9b6a 2449 regprop(prog, prop, c);
0df25f3d 2450 {
f2ed9b32 2451 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2452 s,strend-s,60);
0df25f3d 2453 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2454 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2455 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2456 quoted, (int)(strend - s));
0df25f3d 2457 }
ffc61ed2 2458 });
984e6dd1 2459 if (find_byclass(prog, c, s, strend, &reginfo, reginfo.is_utf8_pat))
6eb5f6b9 2460 goto got_it;
07be1b83 2461 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2462 }
2463 else {
2464 dontbother = 0;
a0714e2c 2465 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2466 /* Trim the end. */
6af40bd7 2467 char *last= NULL;
33b8afdf 2468 SV* float_real;
c33e64f0
FC
2469 STRLEN len;
2470 const char *little;
33b8afdf 2471
7e0d5ad7
KW
2472 if (utf8_target) {
2473 if (! prog->float_utf8) {
2474 to_utf8_substr(prog);
2475 }
2476 float_real = prog->float_utf8;
2477 }
2478 else {
2479 if (! prog->float_substr) {
2480 if (! to_byte_substr(prog)) {
6b54ddc5 2481 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2482 }
2483 }
2484 float_real = prog->float_substr;
2485 }
d6a28714 2486
c33e64f0
FC
2487 little = SvPV_const(float_real, len);
2488 if (SvTAIL(float_real)) {
7f18ad16
KW
2489 /* This means that float_real contains an artificial \n on
2490 * the end due to the presence of something like this:
2491 * /foo$/ where we can match both "foo" and "foo\n" at the
2492 * end of the string. So we have to compare the end of the
2493 * string first against the float_real without the \n and
2494 * then against the full float_real with the string. We
2495 * have to watch out for cases where the string might be
2496 * smaller than the float_real or the float_real without
2497 * the \n. */
1a13b075
YO
2498 char *checkpos= strend - len;
2499 DEBUG_OPTIMISE_r(
2500 PerlIO_printf(Perl_debug_log,
2501 "%sChecking for float_real.%s\n",
2502 PL_colors[4], PL_colors[5]));
2503 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2504 /* can't match, even if we remove the trailing \n
2505 * string is too short to match */
1a13b075
YO
2506 DEBUG_EXECUTE_r(
2507 PerlIO_printf(Perl_debug_log,
2508 "%sString shorter than required trailing substring, cannot match.%s\n",
2509 PL_colors[4], PL_colors[5]));
2510 goto phooey;
2511 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2512 /* can match, the end of the string matches without the
2513 * "\n" */
1a13b075
YO
2514 last = checkpos + 1;
2515 } else if (checkpos < strbeg) {
7f18ad16
KW
2516 /* cant match, string is too short when the "\n" is
2517 * included */
1a13b075
YO
2518 DEBUG_EXECUTE_r(
2519 PerlIO_printf(Perl_debug_log,
2520 "%sString does not contain required trailing substring, cannot match.%s\n",
2521 PL_colors[4], PL_colors[5]));
2522 goto phooey;
2523 } else if (!multiline) {
7f18ad16
KW
2524 /* non multiline match, so compare with the "\n" at the
2525 * end of the string */
1a13b075
YO
2526 if (memEQ(checkpos, little, len)) {
2527 last= checkpos;
2528 } else {
2529 DEBUG_EXECUTE_r(
2530 PerlIO_printf(Perl_debug_log,
2531 "%sString does not contain required trailing substring, cannot match.%s\n",
2532 PL_colors[4], PL_colors[5]));
2533 goto phooey;
2534 }
2535 } else {
7f18ad16
KW
2536 /* multiline match, so we have to search for a place
2537 * where the full string is located */
d6a28714 2538 goto find_last;
1a13b075 2539 }
c33e64f0 2540 } else {
d6a28714 2541 find_last:
9041c2e3 2542 if (len)
d6a28714 2543 last = rninstr(s, strend, little, little + len);
b8c5462f 2544 else
a0288114 2545 last = strend; /* matching "$" */
b8c5462f 2546 }
6af40bd7 2547 if (!last) {
7f18ad16
KW
2548 /* at one point this block contained a comment which was
2549 * probably incorrect, which said that this was a "should not
2550 * happen" case. Even if it was true when it was written I am
2551 * pretty sure it is not anymore, so I have removed the comment
2552 * and replaced it with this one. Yves */
6bda09f9
YO
2553 DEBUG_EXECUTE_r(
2554 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2555 "String does not contain required substring, cannot match.\n"
2556 ));
2557 goto phooey;
bf93d4cc 2558 }
d6a28714
JH
2559 dontbother = strend - last + prog->float_min_offset;
2560 }
2561 if (minlen && (dontbother < minlen))
2562 dontbother = minlen - 1;
2563 strend -= dontbother; /* this one's always in bytes! */
2564 /* We don't know much -- general case. */
f2ed9b32 2565 if (utf8_target) {
d6a28714 2566 for (;;) {
24b23f37 2567 if (regtry(&reginfo, &s))
d6a28714
JH
2568 goto got_it;
2569 if (s >= strend)
2570 break;
b8c5462f 2571 s += UTF8SKIP(s);
d6a28714
JH
2572 };
2573 }
2574 else {
2575 do {
24b23f37 2576 if (regtry(&reginfo, &s))
d6a28714
JH
2577 goto got_it;
2578 } while (s++ < strend);
2579 }
2580 }
2581
2582 /* Failure. */
2583 goto phooey;
2584
2585got_it:
495f47a5
DM
2586 DEBUG_BUFFERS_r(
2587 if (swap)
2588 PerlIO_printf(Perl_debug_log,
2589 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2590 PTR2UV(prog),
2591 PTR2UV(swap)
2592 );
2593 );
e9105d30 2594 Safefree(swap);
d6a28714 2595
ed301438 2596 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2597 restore_pos(aTHX_ prog);
5daac39c
NC
2598 if (RXp_PAREN_NAMES(prog))
2599 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
2600
2601 /* make sure $`, $&, $', and $digit will work later */
2602 if ( !(flags & REXEC_NOT_FIRST) ) {
d6a28714 2603 if (flags & REXEC_COPY_STR) {
db2c6cb3
FC
2604#ifdef PERL_ANY_COW
2605 if (SvCANCOW(sv)) {
ed252734
NC
2606 if (DEBUG_C_TEST) {
2607 PerlIO_printf(Perl_debug_log,
2608 "Copy on write: regexp capture, type %d\n",
2609 (int) SvTYPE(sv));
2610 }
77f8f7c1 2611 RX_MATCH_COPY_FREE(rx);
ed252734 2612 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
d5263905 2613 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
ed252734 2614 assert (SvPOKp(prog->saved_copy));
6502e081
DM
2615 prog->sublen = PL_regeol - strbeg;
2616 prog->suboffset = 0;
2617 prog->subcoffset = 0;
ed252734
NC
2618 } else
2619#endif
2620 {
6502e081
DM
2621 I32 min = 0;
2622 I32 max = PL_regeol - strbeg;
2623 I32 sublen;
2624
2625 if ( (flags & REXEC_COPY_SKIP_POST)
2626 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2627 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2628 ) { /* don't copy $' part of string */
3de645a8 2629 U32 n = 0;
6502e081
DM
2630 max = -1;
2631 /* calculate the right-most part of the string covered
2632 * by a capture. Due to look-ahead, this may be to
2633 * the right of $&, so we have to scan all captures */
2634 while (n <= prog->lastparen) {
2635 if (prog->offs[n].end > max)
2636 max = prog->offs[n].end;
2637 n++;
2638 }
2639 if (max == -1)
2640 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2641 ? prog->offs[0].start
2642 : 0;
2643 assert(max >= 0 && max <= PL_regeol - strbeg);
2644 }
2645
2646 if ( (flags & REXEC_COPY_SKIP_PRE)
2647 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2648 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2649 ) { /* don't copy $` part of string */
3de645a8 2650 U32 n = 0;
6502e081
DM
2651 min = max;
2652 /* calculate the left-most part of the string covered
2653 * by a capture. Due to look-behind, this may be to
2654 * the left of $&, so we have to scan all captures */
2655 while (min && n <= prog->lastparen) {
2656 if ( prog->offs[n].start != -1
2657 && prog->offs[n].start < min)
2658 {
2659 min = prog->offs[n].start;
2660 }
2661 n++;
2662 }
2663 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2664 && min > prog->offs[0].end
2665 )
2666 min = prog->offs[0].end;
2667
2668 }
2669
2670 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2671 sublen = max - min;
2672
2673 if (RX_MATCH_COPIED(rx)) {
2674 if (sublen > prog->sublen)
2675 prog->subbeg =
2676 (char*)saferealloc(prog->subbeg, sublen+1);
2677 }
2678 else
2679 prog->subbeg = (char*)safemalloc(sublen+1);
2680 Copy(strbeg + min, prog->subbeg, sublen, char);
2681 prog->subbeg[sublen] = '\0';
2682 prog->suboffset = min;
2683 prog->sublen = sublen;
77f8f7c1 2684 RX_MATCH_COPIED_on(rx);
6502e081 2685 }
6502e081
DM
2686 prog->subcoffset = prog->suboffset;
2687 if (prog->suboffset && utf8_target) {
2688 /* Convert byte offset to chars.
2689 * XXX ideally should only compute this if @-/@+
2690 * has been seen, a la PL_sawampersand ??? */
2691
2692 /* If there's a direct correspondence between the
2693 * string which we're matching and the original SV,
2694 * then we can use the utf8 len cache associated with
2695 * the SV. In particular, it means that under //g,
2696 * sv_pos_b2u() will use the previously cached
2697 * position to speed up working out the new length of
2698 * subcoffset, rather than counting from the start of
2699 * the string each time. This stops
2700 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2701 * from going quadratic */
2702 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2703 sv_pos_b2u(sv, &(prog->subcoffset));
2704 else
2705 prog->subcoffset = utf8_length((U8*)strbeg,
2706 (U8*)(strbeg+prog->suboffset));
2707 }
d6a28714
JH
2708 }
2709 else {
6502e081 2710 RX_MATCH_COPY_FREE(rx);
d6a28714 2711 prog->subbeg = strbeg;
6502e081
DM
2712 prog->suboffset = 0;
2713 prog->subcoffset = 0;
d6a28714
JH
2714 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2715 }
2716 }
9041c2e3 2717
d6a28714
JH
2718 return 1;
2719
2720phooey:
a3621e74 2721 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 2722 PL_colors[4], PL_colors[5]));
ed301438 2723 if (PL_reg_state.re_state_eval_setup_done)
4f639d21 2724 restore_pos(aTHX_ prog);
e9105d30 2725 if (swap) {
c74340f9 2726 /* we failed :-( roll it back */
495f47a5
DM
2727 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2728 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2729 PTR2UV(prog),
2730 PTR2UV(prog->offs),
2731 PTR2UV(swap)
2732 ));
e9105d30
GG
2733 Safefree(prog->offs);
2734 prog->offs = swap;
2735 }
d6a28714
JH
2736 return 0;
2737}
2738
6bda09f9 2739
ec43f78b
DM
2740/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2741 * Do inc before dec, in case old and new rex are the same */
2742#define SET_reg_curpm(Re2) \
2743 if (PL_reg_state.re_state_eval_setup_done) { \
2744 (void)ReREFCNT_inc(Re2); \
2745 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2746 PM_SETRE((PL_reg_curpm), (Re2)); \
2747 }
2748
2749
d6a28714
JH
2750/*
2751 - regtry - try match at specific point
2752 */
2753STATIC I32 /* 0 failure, 1 success */
f73aaa43 2754S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 2755{
97aff369 2756 dVAR;
d6a28714 2757 CHECKPOINT lastcp;
288b8c02 2758 REGEXP *const rx = reginfo->prog;
8d919b0a 2759 regexp *const prog = ReANY(rx);
f73aaa43 2760 I32 result;
f8fc2ecf 2761 RXi_GET_DECL(prog,progi);
a3621e74 2762 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2763
2764 PERL_ARGS_ASSERT_REGTRY;
2765
24b23f37 2766 reginfo->cutpoint=NULL;
d6a28714 2767
ed301438
DM
2768 if ((prog->extflags & RXf_EVAL_SEEN)
2769 && !PL_reg_state.re_state_eval_setup_done)
2770 {
d6a28714
JH
2771 MAGIC *mg;
2772
ed301438 2773 PL_reg_state.re_state_eval_setup_done = TRUE;
3b0527fe 2774 if (reginfo->sv) {
d6a28714 2775 /* Make $_ available to executed code. */
3b0527fe 2776 if (reginfo->sv != DEFSV) {
59f00321 2777 SAVE_DEFSV;
414bf5ae 2778 DEFSV_set(reginfo->sv);
b8c5462f 2779 }
d6a28714 2780
3b0527fe
DM
2781 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2782 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
d6a28714 2783 /* prepare for quick setting of pos */
d300d9fa 2784#ifdef PERL_OLD_COPY_ON_WRITE
51a9ea20
NC
2785 if (SvIsCOW(reginfo->sv))
2786 sv_force_normal_flags(reginfo->sv, 0);
d300d9fa 2787#endif
3dab1dad 2788 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
d300d9fa 2789 &PL_vtbl_mglob, NULL, 0);
d6a28714 2790 mg->mg_len = -1;
b8c5462f 2791 }
d6a28714
JH
2792 PL_reg_magic = mg;
2793 PL_reg_oldpos = mg->mg_len;
4f639d21 2794 SAVEDESTRUCTOR_X(restore_pos, prog);
d6a28714 2795 }
09687e5a 2796 if (!PL_reg_curpm) {
a02a5408 2797 Newxz(PL_reg_curpm, 1, PMOP);
09687e5a
AB
2798#ifdef USE_ITHREADS
2799 {
14a49a24 2800 SV* const repointer = &PL_sv_undef;
92313705
NC
2801 /* this regexp is also owned by the new PL_reg_curpm, which
2802 will try to free it. */
d2ece331 2803 av_push(PL_regex_padav, repointer);
09687e5a
AB
2804 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2805 PL_regex_pad = AvARRAY(PL_regex_padav);
2806 }
2807#endif
2808 }
ec43f78b 2809 SET_reg_curpm(rx);
d6a28714
JH
2810 PL_reg_oldcurpm = PL_curpm;
2811 PL_curpm = PL_reg_curpm;
07bc277f 2812 if (RXp_MATCH_COPIED(prog)) {
d6a28714
JH
2813 /* Here is a serious problem: we cannot rewrite subbeg,
2814 since it may be needed if this match fails. Thus
2815 $` inside (?{}) could fail... */
2816 PL_reg_oldsaved = prog->subbeg;
2817 PL_reg_oldsavedlen = prog->sublen;
6502e081
DM
2818 PL_reg_oldsavedoffset = prog->suboffset;
2819 PL_reg_oldsavedcoffset = prog->suboffset;
db2c6cb3 2820#ifdef PERL_ANY_COW
ed252734
NC
2821 PL_nrs = prog->saved_copy;
2822#endif
07bc277f 2823 RXp_MATCH_COPIED_off(prog);
d6a28714
JH
2824 }
2825 else
bd61b366 2826 PL_reg_oldsaved = NULL;
d6a28714 2827 prog->subbeg = PL_bostr;
6502e081
DM
2828 prog->suboffset = 0;
2829 prog->subcoffset = 0;
d6a28714
JH
2830 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2831 }
97ca13b7 2832#ifdef DEBUGGING
f73aaa43 2833 PL_reg_starttry = *startposp;
97ca13b7 2834#endif
f73aaa43 2835 prog->offs[0].start = *startposp - PL_bostr;
d6a28714 2836 prog->lastparen = 0;
03994de8 2837 prog->lastcloseparen = 0;
d6a28714
JH
2838
2839 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 2840 to do this again and again, prog->lastparen should take care of
3dd2943c 2841 this! --ilya*/
dafc8851
JH
2842
2843 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2844 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 2845 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
2846 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2847 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2848 * Meanwhile, this code *is* needed for the
daf18116
JH
2849 * above-mentioned test suite tests to succeed. The common theme
2850 * on those tests seems to be returning null fields from matches.
225593e1 2851 * --jhi updated by dapm */
dafc8851 2852#if 1
d6a28714 2853 if (prog->nparens) {
b93070ed 2854 regexp_paren_pair *pp = prog->offs;
eb578fdb 2855 I32 i;
b93070ed 2856 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
2857 ++pp;
2858 pp->start = -1;
2859 pp->end = -1;
d6a28714
JH
2860 }
2861 }
dafc8851 2862#endif
02db2b7b 2863 REGCP_SET(lastcp);
f73aaa43
DM
2864 result = regmatch(reginfo, *startposp, progi->program + 1);
2865 if (result != -1) {
2866 prog->offs[0].end = result;
d6a28714
JH
2867 return 1;
2868 }
24b23f37 2869 if (reginfo->cutpoint)
f73aaa43 2870 *startposp= reginfo->cutpoint;
02db2b7b 2871 REGCP_UNWIND(lastcp);
d6a28714
JH
2872 return 0;
2873}
2874
02db2b7b 2875
8ba1375e
MJD
2876#define sayYES goto yes
2877#define sayNO goto no
262b90c4 2878#define sayNO_SILENT goto no_silent
8ba1375e 2879
f9f4320a
YO
2880/* we dont use STMT_START/END here because it leads to
2881 "unreachable code" warnings, which are bogus, but distracting. */
2882#define CACHEsayNO \
c476f425
DM
2883 if (ST.cache_mask) \
2884 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 2885 sayNO
3298f257 2886
a3621e74 2887/* this is used to determine how far from the left messages like
265c4333
YO
2888 'failed...' are printed. It should be set such that messages
2889 are inline with the regop output that created them.
a3621e74 2890*/
265c4333 2891#define REPORT_CODE_OFF 32
a3621e74
YO
2892
2893
40a82448
DM
2894#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2895#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
2896#define CHRTEST_NOT_A_CP_1 -999
2897#define CHRTEST_NOT_A_CP_2 -998
9e137952 2898
86545054
DM
2899#define SLAB_FIRST(s) (&(s)->states[0])
2900#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2901
5d9a96ca
DM
2902/* grab a new slab and return the first slot in it */
2903
2904STATIC regmatch_state *
2905S_push_slab(pTHX)
2906{
a35a87e7 2907#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
2908 dMY_CXT;
2909#endif
5d9a96ca
DM
2910 regmatch_slab *s = PL_regmatch_slab->next;
2911 if (!s) {
2912 Newx(s, 1, regmatch_slab);
2913 s->prev = PL_regmatch_slab;
2914 s->next = NULL;
2915 PL_regmatch_slab->next = s;
2916 }
2917 PL_regmatch_slab = s;
86545054 2918 return SLAB_FIRST(s);
5d9a96ca 2919}
5b47454d 2920
95b24440 2921
40a82448
DM
2922/* push a new state then goto it */
2923
4d5016e5
DM
2924#define PUSH_STATE_GOTO(state, node, input) \
2925 pushinput = input; \
40a82448
DM
2926 scan = node; \
2927 st->resume_state = state; \
2928 goto push_state;
2929
2930/* push a new state with success backtracking, then goto it */
2931
4d5016e5
DM
2932#define PUSH_YES_STATE_GOTO(state, node, input) \
2933 pushinput = input; \
40a82448
DM
2934 scan = node; \
2935 st->resume_state = state; \
2936 goto push_yes_state;
2937
aa283a38 2938
aa283a38 2939
4d5016e5 2940
d6a28714 2941/*
95b24440 2942
bf1f174e
DM
2943regmatch() - main matching routine
2944
2945This is basically one big switch statement in a loop. We execute an op,
2946set 'next' to point the next op, and continue. If we come to a point which
2947we may need to backtrack to on failure such as (A|B|C), we push a
2948backtrack state onto the backtrack stack. On failure, we pop the top
2949state, and re-enter the loop at the state indicated. If there are no more
2950states to pop, we return failure.
2951
2952Sometimes we also need to backtrack on success; for example /A+/, where
2953after successfully matching one A, we need to go back and try to
2954match another one; similarly for lookahead assertions: if the assertion
2955completes successfully, we backtrack to the state just before the assertion
2956and then carry on. In these cases, the pushed state is marked as
2957'backtrack on success too'. This marking is in fact done by a chain of
2958pointers, each pointing to the previous 'yes' state. On success, we pop to
2959the nearest yes state, discarding any intermediate failure-only states.
2960Sometimes a yes state is pushed just to force some cleanup code to be
2961called at the end of a successful match or submatch; e.g. (??{$re}) uses
2962it to free the inner regex.
2963
2964Note that failure backtracking rewinds the cursor position, while
2965success backtracking leaves it alone.
2966
2967A pattern is complete when the END op is executed, while a subpattern
2968such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2969ops trigger the "pop to last yes state if any, otherwise return true"
2970behaviour.
2971
2972A common convention in this function is to use A and B to refer to the two
2973subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2974the subpattern to be matched possibly multiple times, while B is the entire
2975rest of the pattern. Variable and state names reflect this convention.
2976
2977The states in the main switch are the union of ops and failure/success of
2978substates associated with with that op. For example, IFMATCH is the op
2979that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2980'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2981successfully matched A and IFMATCH_A_fail is a state saying that we have
2982just failed to match A. Resume states always come in pairs. The backtrack
2983state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2984at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2985on success or failure.
2986
2987The struct that holds a backtracking state is actually a big union, with
2988one variant for each major type of op. The variable st points to the
2989top-most backtrack struct. To make the code clearer, within each
2990block of code we #define ST to alias the relevant union.
2991
2992Here's a concrete example of a (vastly oversimplified) IFMATCH
2993implementation:
2994
2995 switch (state) {
2996 ....
2997
2998#define ST st->u.ifmatch
2999
3000 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3001 ST.foo = ...; // some state we wish to save
95b24440 3002 ...
bf1f174e
DM
3003 // push a yes backtrack state with a resume value of
3004 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3005 // first node of A:
4d5016e5 3006 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3007 // NOTREACHED
3008
3009 case IFMATCH_A: // we have successfully executed A; now continue with B
3010 next = B;
3011 bar = ST.foo; // do something with the preserved value
3012 break;
3013
3014 case IFMATCH_A_fail: // A failed, so the assertion failed
3015 ...; // do some housekeeping, then ...
3016 sayNO; // propagate the failure
3017
3018#undef ST
95b24440 3019
bf1f174e
DM
3020 ...
3021 }
95b24440 3022
bf1f174e
DM
3023For any old-timers reading this who are familiar with the old recursive
3024approach, the code above is equivalent to:
95b24440 3025
bf1f174e
DM
3026 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3027 {
3028 int foo = ...
95b24440 3029 ...
bf1f174e
DM
3030 if (regmatch(A)) {
3031 next = B;
3032 bar = foo;
3033 break;
95b24440 3034 }
bf1f174e
DM
3035 ...; // do some housekeeping, then ...
3036 sayNO; // propagate the failure
95b24440 3037 }
bf1f174e
DM
3038
3039The topmost backtrack state, pointed to by st, is usually free. If you
3040want to claim it, populate any ST.foo fields in it with values you wish to
3041save, then do one of
3042
4d5016e5
DM
3043 PUSH_STATE_GOTO(resume_state, node, newinput);
3044 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3045
3046which sets that backtrack state's resume value to 'resume_state', pushes a
3047new free entry to the top of the backtrack stack, then goes to 'node'.
3048On backtracking, the free slot is popped, and the saved state becomes the
3049new free state. An ST.foo field in this new top state can be temporarily
3050accessed to retrieve values, but once the main loop is re-entered, it
3051becomes available for reuse.
3052
3053Note that the depth of the backtrack stack constantly increases during the
3054left-to-right execution of the pattern, rather than going up and down with
3055the pattern nesting. For example the stack is at its maximum at Z at the
3056end of the pattern, rather than at X in the following:
3057
3058 /(((X)+)+)+....(Y)+....Z/
3059
3060The only exceptions to this are lookahead/behind assertions and the cut,
3061(?>A), which pop all the backtrack states associated with A before
3062continuing.
3063
486ec47a 3064Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3065PL_regmatch_state and st always point to the currently active state,
3066and PL_regmatch_slab points to the slab currently containing
3067PL_regmatch_state. The first time regmatch() is called, the first slab is
3068allocated, and is never freed until interpreter destruction. When the slab
3069is full, a new one is allocated and chained to the end. At exit from
3070regmatch(), slabs allocated since entry are freed.
3071
3072*/
95b24440 3073
40a82448 3074
5bc10b2c 3075#define DEBUG_STATE_pp(pp) \
265c4333 3076 DEBUG_STATE_r({ \
f2ed9b32 3077 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3078 PerlIO_printf(Perl_debug_log, \
5d458dd8 3079 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3080 depth*2, "", \
13d6edb4 3081 PL_reg_name[st->resume_state], \
5d458dd8
YO
3082 ((st==yes_state||st==mark_state) ? "[" : ""), \
3083 ((st==yes_state) ? "Y" : ""), \
3084 ((st==mark_state) ? "M" : ""), \
3085 ((st==yes_state||st==mark_state) ? "]" : "") \
3086 ); \
265c4333 3087 });
5bc10b2c 3088
40a82448 3089
3dab1dad 3090#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3091
3df15adc 3092#ifdef DEBUGGING
5bc10b2c 3093
ab3bbdeb 3094STATIC void
f2ed9b32 3095S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3096 const char *start, const char *end, const char *blurb)
3097{
efd26800 3098 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3099
3100 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3101
ab3bbdeb
YO
3102 if (!PL_colorset)
3103 reginitcolors();
3104 {
3105 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3106 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3107
f2ed9b32 3108 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3109 start, end - start, 60);
3110
3111 PerlIO_printf(Perl_debug_log,
3112 "%s%s REx%s %s against %s\n",
3113 PL_colors[4], blurb, PL_colors[5], s0, s1);
3114
f2ed9b32 3115 if (utf8_target||utf8_pat)
1de06328
YO
3116 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3117 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3118 utf8_pat && utf8_target ? " and " : "",
3119 utf8_target ? "string" : ""
ab3bbdeb
YO
3120 );
3121 }
3122}
3df15adc
YO
3123
3124STATIC void
786e8c11
YO
3125S_dump_exec_pos(pTHX_ const char *locinput,
3126 const regnode *scan,
3127 const char *loc_regeol,
3128 const char *loc_bostr,
3129 const char *loc_reg_starttry,
f2ed9b32 3130 const bool utf8_target)
07be1b83 3131{
786e8c11 3132 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3133 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3134 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3135 /* The part of the string before starttry has one color
3136 (pref0_len chars), between starttry and current
3137 position another one (pref_len - pref0_len chars),
3138 after the current position the third one.
3139 We assume that pref0_len <= pref_len, otherwise we
3140 decrease pref0_len. */
786e8c11
YO
3141 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3142 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3143 int pref0_len;
3144
7918f24d
NC
3145 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3146
f2ed9b32 3147 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3148 pref_len++;
786e8c11
YO
3149 pref0_len = pref_len - (locinput - loc_reg_starttry);
3150 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3151 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3152 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3153 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3154 l--;
3155 if (pref0_len < 0)
3156 pref0_len = 0;
3157 if (pref0_len > pref_len)
3158 pref0_len = pref_len;
3159 {
f2ed9b32 3160 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3161
ab3bbdeb 3162 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3163 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3164
ab3bbdeb 3165 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3166 (locinput - pref_len + pref0_len),
1de06328 3167 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3168
ab3bbdeb 3169 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3170 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3171
1de06328 3172 const STRLEN tlen=len0+len1+len2;
3df15adc 3173 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3174 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3175 (IV)(locinput - loc_bostr),
07be1b83 3176 len0, s0,
07be1b83 3177 len1, s1,
07be1b83 3178 (docolor ? "" : "> <"),
07be1b83 3179 len2, s2,
f9f4320a 3180 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3181 "");
3182 }
3183}
3df15adc 3184
07be1b83
YO
3185#endif
3186
0a4db386
YO
3187/* reg_check_named_buff_matched()
3188 * Checks to see if a named buffer has matched. The data array of
3189 * buffer numbers corresponding to the buffer is expected to reside
3190 * in the regexp->data->data array in the slot stored in the ARG() of
3191 * node involved. Note that this routine doesn't actually care about the
3192 * name, that information is not preserved from compilation to execution.
3193 * Returns the index of the leftmost defined buffer with the given name
3194 * or 0 if non of the buffers matched.
3195 */
3196STATIC I32
7918f24d
NC
3197S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3198{
0a4db386 3199 I32 n;
f8fc2ecf 3200 RXi_GET_DECL(rex,rexi);
ad64d0ec 3201 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3202 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3203
3204 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3205
0a4db386 3206 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3207 if ((I32)rex->lastparen >= nums[n] &&
3208 rex->offs[nums[n]].end != -1)
0a4db386
YO
3209 {
3210 return nums[n];
3211 }
3212 }
3213 return 0;
3214}
3215
2f554ef7
DM
3216
3217/* free all slabs above current one - called during LEAVE_SCOPE */
3218
3219STATIC void
3220S_clear_backtrack_stack(pTHX_ void *p)
3221{
3222 regmatch_slab *s = PL_regmatch_slab->next;
3223 PERL_UNUSED_ARG(p);
3224
3225 if (!s)
3226 return;
3227 PL_regmatch_slab->next = NULL;
3228 while (s) {
3229 regmatch_slab * const osl = s;
3230 s = s->next;
3231 Safefree(osl);
3232 }
3233}
c74f6de9 3234static bool
984e6dd1
DM
3235S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3236 U8* c1_utf8, int *c2p, U8* c2_utf8, bool is_utf8_pat)
c74f6de9 3237{
79a2a0e8
KW
3238 /* This function determines if there are one or two characters that match
3239 * the first character of the passed-in EXACTish node <text_node>, and if
3240 * so, returns them in the passed-in pointers.
c74f6de9 3241 *
79a2a0e8
KW
3242 * If it determines that no possible character in the target string can
3243 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3244 * the first character in <text_node> requires UTF-8 to represent, and the
3245 * target string isn't in UTF-8.)
c74f6de9 3246 *
79a2a0e8
KW
3247 * If there are more than two characters that could match the beginning of
3248 * <text_node>, or if more context is required to determine a match or not,
3249 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3250 *
3251 * The motiviation behind this function is to allow the caller to set up
3252 * tight loops for matching. If <text_node> is of type EXACT, there is
3253 * only one possible character that can match its first character, and so
3254 * the situation is quite simple. But things get much more complicated if
3255 * folding is involved. It may be that the first character of an EXACTFish
3256 * node doesn't participate in any possible fold, e.g., punctuation, so it
3257 * can be matched only by itself. The vast majority of characters that are
3258 * in folds match just two things, their lower and upper-case equivalents.
3259 * But not all are like that; some have multiple possible matches, or match
3260 * sequences of more than one character. This function sorts all that out.
3261 *
3262 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3263 * loop of trying to match A*, we know we can't exit where the thing
3264 * following it isn't a B. And something can't be a B unless it is the
3265 * beginning of B. By putting a quick test for that beginning in a tight
3266 * loop, we can rule out things that can't possibly be B without having to
3267 * break out of the loop, thus avoiding work. Similarly, if A is a single
3268 * character, we can make a tight loop matching A*, using the outputs of
3269 * this function.
3270 *
3271 * If the target string to match isn't in UTF-8, and there aren't
3272 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3273 * the one or two possible octets (which are characters in this situation)
3274 * that can match. In all cases, if there is only one character that can
3275 * match, *<c1p> and *<c2p> will be identical.
3276 *
3277 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3278 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3279 * can match the beginning of <text_node>. They should be declared with at
3280 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3281 * undefined what these contain.) If one or both of the buffers are
3282 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3283 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3284 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3285 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3286 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9
KW
3287
3288 const bool utf8_target = PL_reg_match_utf8;
79a2a0e8 3289
ddb0d839
KW
3290 UV c1 = CHRTEST_NOT_A_CP_1;
3291 UV c2 = CHRTEST_NOT_A_CP_2;
79a2a0e8
KW
3292 bool use_chrtest_void = FALSE;
3293
3294 /* Used when we have both utf8 input and utf8 output, to avoid converting
3295 * to/from code points */
3296 bool utf8_has_been_setup = FALSE;
3297
c74f6de9
KW
3298 dVAR;
3299
b4291290 3300 U8 *pat = (U8*)STRING(text_node);
c74f6de9 3301
79a2a0e8
KW
3302 if (OP(text_node) == EXACT) {
3303
3304 /* In an exact node, only one thing can be matched, that first
3305 * character. If both the pat and the target are UTF-8, we can just
3306 * copy the input to the output, avoiding finding the code point of
3307 * that character */
984e6dd1 3308 if (!is_utf8_pat) {
79a2a0e8
KW
3309 c2 = c1 = *pat;
3310 }
3311 else if (utf8_target) {
3312 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3313 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3314 utf8_has_been_setup = TRUE;
3315 }
3316 else {
3317 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3318 }
79a2a0e8
KW
3319 }
3320 else /* an EXACTFish node */
984e6dd1 3321 if ((is_utf8_pat
79a2a0e8
KW
3322 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3323 pat + STR_LEN(text_node)))
984e6dd1 3324 || (!is_utf8_pat
79a2a0e8
KW
3325 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3326 pat + STR_LEN(text_node))))
3327 {
3328 /* Multi-character folds require more context to sort out. Also
3329 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3330 * handled outside this routine */
3331 use_chrtest_void = TRUE;
3332 }
3333 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
984e6dd1 3334 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
79a2a0e8
KW
3335 if (c1 > 256) {
3336 /* Load the folds hash, if not already done */
3337 SV** listp;
3338 if (! PL_utf8_foldclosures) {
3339 if (! PL_utf8_tofold) {
3340 U8 dummy[UTF8_MAXBYTES+1];
3341
3342 /* Force loading this by folding an above-Latin1 char */
3343 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3344 assert(PL_utf8_tofold); /* Verify that worked */
3345 }
3346 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3347 }
3348
3349 /* The fold closures data structure is a hash with the keys being
3350 * the UTF-8 of every character that is folded to, like 'k', and
3351 * the values each an array of all code points that fold to its
3352 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3353 * not included */
3354 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3355 (char *) pat,
3356 UTF8SKIP(pat),
3357 FALSE))))
3358 {
3359 /* Not found in the hash, therefore there are no folds
3360 * containing it, so there is only a single character that
3361 * could match */
3362 c2 = c1;
3363 }
3364 else { /* Does participate in folds */
3365 AV* list = (AV*) *listp;
3366 if (av_len(list) != 1) {
3367
3368 /* If there aren't exactly two folds to this, it is outside
3369 * the scope of this function */
3370 use_chrtest_void = TRUE;
3371 }
3372 else { /* There are two. Get them */
3373 SV** c_p = av_fetch(list, 0, FALSE);
3374 if (c_p == NULL) {
3375 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3376 }
3377 c1 = SvUV(*c_p);
3378
3379 c_p = av_fetch(list, 1, FALSE);
3380 if (c_p == NULL) {
3381 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3382 }
3383 c2 = SvUV(*c_p);
3384
3385 /* Folds that cross the 255/256 boundary are forbidden if
3386 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3387 * pattern character is above 256, and its only other match
3388 * is below 256, the only legal match will be to itself.
3389 * We have thrown away the original, so have to compute
3390 * which is the one above 255 */
3391 if ((c1 < 256) != (c2 < 256)) {
3392 if (OP(text_node) == EXACTFL
3393 || (OP(text_node) == EXACTFA
3394 && (isASCII(c1) || isASCII(c2))))
3395 {
3396 if (c1 < 256) {
3397 c1 = c2;
3398 }
3399 else {
3400 c2 = c1;
3401 }
3402 }
3403 }
3404 }
3405 }
3406 }
3407 else /* Here, c1 is < 255 */
3408 if (utf8_target
3409 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3410 && OP(text_node) != EXACTFL
3411 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
c74f6de9
KW
3412 {
3413 /* Here, there could be something above Latin1 in the target which
79a2a0e8
KW
3414 * folds to this character in the pattern. All such cases except
3415 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3416 * involved in their folds, so are outside the scope of this
3417 * function */
3418 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3419 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3420 }
3421 else {
3422 use_chrtest_void = TRUE;
3423 }
c74f6de9
KW
3424 }
3425 else { /* Here nothing above Latin1 can fold to the pattern character */
3426 switch (OP(text_node)) {
3427
3428 case EXACTFL: /* /l rules */
79a2a0e8 3429 c2 = PL_fold_locale[c1];
c74f6de9
KW
3430 break;
3431
3432 case EXACTF:
3433 if (! utf8_target) { /* /d rules */
79a2a0e8 3434 c2 = PL_fold[c1];
c74f6de9
KW
3435 break;
3436 }
3437 /* FALLTHROUGH */
3438 /* /u rules for all these. This happens to work for
79a2a0e8 3439 * EXACTFA as nothing in Latin1 folds to ASCII */
c74f6de9
KW
3440 case EXACTFA:
3441 case EXACTFU_TRICKYFOLD:
79a2a0e8 3442 case EXACTFU_SS:
c74f6de9 3443 case EXACTFU:
79a2a0e8 3444 c2 = PL_fold_latin1[c1];
c74f6de9
KW
3445 break;
3446
878531d3
KW
3447 default:
3448 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3449 assert(0); /* NOTREACHED */
c74f6de9
KW
3450 }
3451 }
3452 }
79a2a0e8
KW
3453
3454 /* Here have figured things out. Set up the returns */
3455 if (use_chrtest_void) {
3456 *c2p = *c1p = CHRTEST_VOID;
3457 }
3458 else if (utf8_target) {
3459 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3460 uvchr_to_utf8(c1_utf8, c1);
3461 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 3462 }
c74f6de9 3463
79a2a0e8
KW
3464 /* Invariants are stored in both the utf8 and byte outputs; Use
3465 * negative numbers otherwise for the byte ones. Make sure that the
3466 * byte ones are the same iff the utf8 ones are the same */
3467 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3468 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3469 ? *c2_utf8
3470 : (c1 == c2)
3471 ? CHRTEST_NOT_A_CP_1
3472 : CHRTEST_NOT_A_CP_2;
3473 }
3474 else if (c1 > 255) {
3475 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3476 can represent */
3477 return FALSE;
3478 }
c74f6de9 3479
79a2a0e8
KW
3480 *c1p = *c2p = c2; /* c2 is the only representable value */
3481 }
3482 else { /* c1 is representable; see about c2 */
3483 *c1p = c1;
3484 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 3485 }
2f554ef7 3486
c74f6de9
KW
3487 return TRUE;
3488}
2f554ef7 3489
f73aaa43
DM
3490/* returns -1 on failure, $+[0] on success */
3491STATIC I32
3492S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 3493{
a35a87e7 3494#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3495 dMY_CXT;
3496#endif
27da23d5 3497 dVAR;
eb578fdb 3498 const bool utf8_target = PL_reg_match_utf8;
4ad0818d 3499 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 3500 REGEXP *rex_sv = reginfo->prog;
8d919b0a 3501 regexp *rex = ReANY(rex_sv);
f8fc2ecf 3502 RXi_GET_DECL(rex,rexi);
2f554ef7 3503 I32 oldsave;
5d9a96ca 3504 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 3505 regmatch_state *st;
5d9a96ca 3506 /* cache heavy used fields of st in registers */
eb578fdb
KW
3507 regnode *scan;
3508 regnode *next;
3509 U32 n = 0; /* general value; init to avoid compiler warning */
3510 I32 ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 3511 char *locinput = startpos;
4d5016e5 3512 char *pushinput; /* where to continue after a PUSH */
eb578fdb 3513 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 3514
b69b0499 3515 bool result = 0; /* return value of S_regmatch */
24d3c4a9 3516 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
3517 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3518 const U32 max_nochange_depth =
3519 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3520 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
3521 regmatch_state *yes_state = NULL; /* state to pop to on success of
3522 subpattern */
e2e6a0f1
YO
3523 /* mark_state piggy backs on the yes_state logic so that when we unwind
3524 the stack on success we can update the mark_state as we go */
3525 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 3526 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 3527 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 3528 U32 state_num;
5d458dd8
YO
3529 bool no_final = 0; /* prevent failure from backtracking? */
3530 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 3531 char *startpoint = locinput;
5d458dd8
YO
3532 SV *popmark = NULL; /* are we looking for a mark? */
3533 SV *sv_commit = NULL; /* last mark name seen in failure */
3534 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 3535 during a successful match */
5d458dd8
YO
3536 U32 lastopen = 0; /* last open we saw */
3537 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
19b95bf0 3538 SV* const oreplsv = GvSV(PL_replgv);
24d3c4a9
DM
3539 /* these three flags are set by various ops to signal information to
3540 * the very next op. They have a useful lifetime of exactly one loop
3541 * iteration, and are not preserved or restored by state pushes/pops
3542 */
3543 bool sw = 0; /* the condition value in (?(cond)a|b) */
3544 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3545 int logical = 0; /* the following EVAL is:
3546 0: (?{...})
3547 1: (?(?{...})X|Y)
3548 2: (??{...})
3549 or the following IFMATCH/UNLESSM is: