This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
re_intuit_start(): move label after var decls
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
e1cf74e3
CB
86#ifdef DEBUGGING
87/* At least one required character in the target string is expressible only in
88 * UTF-8. */
89static const char* const non_utf8_target_but_utf8_required
90 = "Can't match, because target string needs to be in UTF-8\n";
91#endif
92
93#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
95 goto target; \
96} STMT_END
97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
2d66f61e 104/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
3db24e1e 107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 115#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad 117#define HOPc(pos,off) \
ba44c216 118 (char *)(reginfo->is_utf8_target \
220db18a 119 ? reghop3((U8*)pos, off, \
9d9163fb 120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad
YO
121 : (U8*)(pos + off))
122#define HOPBACKc(pos, off) \
ba44c216 123 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
124 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
125 : (pos - off >= reginfo->strbeg) \
8e11feef 126 ? (U8*)pos - off \
3dab1dad 127 : NULL)
efb30f32 128
ba44c216 129#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 130#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 131
8e9f2289
DM
132/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
133 * off must be >=0; args should be vars rather than expressions */
134#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
135 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
136 : (U8*)((pos + off) > lim ? lim : (pos + off)))
137
2974eaec
DM
138#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
139 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
140 : (U8*)(pos + off))
141#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
142
143#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
144#define NEXTCHR_IS_EOS (nextchr < 0)
145
146#define SET_nextchr \
220db18a 147 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
148
149#define SET_locinput(p) \
150 locinput = (p); \
151 SET_nextchr
152
153
2a16ac92 154#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
155 if (!swash_ptr) { \
156 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 157 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 158 1, 0, invlist, &flags); \
c7304fe2
KW
159 assert(swash_ptr); \
160 } \
161 } STMT_END
162
163/* If in debug mode, we test that a known character properly matches */
164#ifdef DEBUGGING
165# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
166 property_name, \
2a16ac92 167 invlist, \
c7304fe2 168 utf8_char_in_property) \
2a16ac92 169 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
170 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
171#else
172# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
173 property_name, \
2a16ac92 174 invlist, \
c7304fe2 175 utf8_char_in_property) \
2a16ac92 176 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 177#endif
d1eb3177 178
c7304fe2
KW
179#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
180 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
181 "", \
182 PL_XPosix_ptrs[_CC_WORDCHAR], \
df38da56 183 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
c7304fe2
KW
184
185#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
186 STMT_START { \
187 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
188 "_X_regular_begin", \
2a16ac92 189 NULL, \
df38da56 190 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
c7304fe2
KW
191 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
192 "_X_extend", \
2a16ac92 193 NULL, \
c7304fe2
KW
194 COMBINING_GRAVE_ACCENT_UTF8); \
195 } STMT_END
d1eb3177 196
c7304fe2 197#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
198/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
199
5f80c4cf 200/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
201/* it would be nice to rework regcomp.sym to generate this stuff. sigh
202 *
203 * NOTE that *nothing* that affects backtracking should be in here, specifically
204 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
205 * node that is in between two EXACT like nodes when ascertaining what the required
206 * "follow" character is. This should probably be moved to regex compile time
207 * although it may be done at run time beause of the REF possibility - more
208 * investigation required. -- demerphq
209*/
baa60164
KW
210#define JUMPABLE(rn) ( \
211 OP(rn) == OPEN || \
3e901dc0 212 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
baa60164
KW
213 OP(rn) == EVAL || \
214 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
215 OP(rn) == PLUS || OP(rn) == MINMOD || \
216 OP(rn) == KEEPS || \
217 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 218)
ee9b8eae 219#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 220
ee9b8eae
YO
221#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
222
223#if 0
224/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
225 we don't need this definition. */
226#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 227#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
228#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
229
230#else
231/* ... so we use this as its faster. */
232#define IS_TEXT(rn) ( OP(rn)==EXACT )
098b07d5 233#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
234#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
235#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
236
237#endif
e2d8ce26 238
a84d97b6
HS
239/*
240 Search for mandatory following text node; for lookahead, the text must
241 follow but for lookbehind (rn->flags != 0) we skip to the next step.
242*/
baa60164 243#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
244 while (JUMPABLE(rn)) { \
245 const OPCODE type = OP(rn); \
246 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 247 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 248 else if (type == PLUS) \
cca55fe3 249 rn = NEXTOPER(rn); \
3dab1dad 250 else if (type == IFMATCH) \
a84d97b6 251 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 252 else rn += NEXT_OFF(rn); \
3dab1dad 253 } \
5f80c4cf 254} STMT_END
74750237 255
22913b96
KW
256/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
257 * These are for the pre-composed Hangul syllables, which are all in a
258 * contiguous block and arranged there in such a way so as to facilitate
259 * alorithmic determination of their characteristics. As such, they don't need
260 * a swash, but can be determined by simple arithmetic. Almost all are
261 * GCB=LVT, but every 28th one is a GCB=LV */
262#define SBASE 0xAC00 /* Start of block */
263#define SCount 11172 /* Length of block */
264#define TCount 28
c476f425 265
006f26b2
DM
266#define SLAB_FIRST(s) (&(s)->states[0])
267#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
268
a75351a1 269static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 270static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 271static regmatch_state * S_push_slab(pTHX);
51371543 272
87c0511b 273#define REGCP_PAREN_ELEMS 3
f067efbf 274#define REGCP_OTHER_ELEMS 3
e0fa7e2b 275#define REGCP_FRAME_ELEMS 1
620d5b66
NC
276/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
277 * are needed for the regexp context stack bookkeeping. */
278
76e3520e 279STATIC CHECKPOINT
92da3157 280S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 281{
97aff369 282 dVAR;
a3b680e6 283 const int retval = PL_savestack_ix;
92da3157
DM
284 const int paren_elems_to_push =
285 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
286 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
287 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 288 I32 p;
40a82448 289 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 290
b93070ed
DM
291 PERL_ARGS_ASSERT_REGCPPUSH;
292
e49a9654 293 if (paren_elems_to_push < 0)
5637ef5b
NC
294 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
295 paren_elems_to_push);
e49a9654 296
e0fa7e2b
NC
297 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
298 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 299 " out of range (%lu-%ld)",
92da3157
DM
300 total_elems,
301 (unsigned long)maxopenparen,
302 (long)parenfloor);
e0fa7e2b 303
620d5b66 304 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 305
495f47a5 306 DEBUG_BUFFERS_r(
92da3157 307 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
308 PerlIO_printf(Perl_debug_log,
309 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
310 PTR2UV(rex),
311 PTR2UV(rex->offs)
312 );
313 );
92da3157 314 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 315/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
316 SSPUSHIV(rex->offs[p].end);
317 SSPUSHIV(rex->offs[p].start);
1ca2007e 318 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 319 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
320 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
321 (UV)p,
322 (IV)rex->offs[p].start,
323 (IV)rex->offs[p].start_tmp,
324 (IV)rex->offs[p].end
40a82448 325 ));
a0d0e21e 326 }
b1ce53c5 327/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 328 SSPUSHINT(maxopenparen);
b93070ed
DM
329 SSPUSHINT(rex->lastparen);
330 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 331 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 332
a0d0e21e
LW
333 return retval;
334}
335
c277df42 336/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
337#define REGCP_SET(cp) \
338 DEBUG_STATE_r( \
ab3bbdeb 339 PerlIO_printf(Perl_debug_log, \
e4f74956 340 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
341 (IV)PL_savestack_ix)); \
342 cp = PL_savestack_ix
c3464db5 343
ab3bbdeb 344#define REGCP_UNWIND(cp) \
e4f74956 345 DEBUG_STATE_r( \
ab3bbdeb 346 if (cp != PL_savestack_ix) \
e4f74956
YO
347 PerlIO_printf(Perl_debug_log, \
348 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
349 (IV)(cp), (IV)PL_savestack_ix)); \
350 regcpblow(cp)
c277df42 351
a8d1f4b4
DM
352#define UNWIND_PAREN(lp, lcp) \
353 for (n = rex->lastparen; n > lp; n--) \
354 rex->offs[n].end = -1; \
355 rex->lastparen = n; \
356 rex->lastcloseparen = lcp;
357
358
f067efbf 359STATIC void
92da3157 360S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 361{
97aff369 362 dVAR;
e0fa7e2b 363 UV i;
87c0511b 364 U32 paren;
a3621e74
YO
365 GET_RE_DEBUG_FLAGS_DECL;
366
7918f24d
NC
367 PERL_ARGS_ASSERT_REGCPPOP;
368
b1ce53c5 369 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 370 i = SSPOPUV;
e0fa7e2b
NC
371 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
372 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
373 rex->lastcloseparen = SSPOPINT;
374 rex->lastparen = SSPOPINT;
92da3157 375 *maxopenparen_p = SSPOPINT;
b1ce53c5 376
620d5b66 377 i -= REGCP_OTHER_ELEMS;
b1ce53c5 378 /* Now restore the parentheses context. */
495f47a5
DM
379 DEBUG_BUFFERS_r(
380 if (i || rex->lastparen + 1 <= rex->nparens)
381 PerlIO_printf(Perl_debug_log,
382 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
383 PTR2UV(rex),
384 PTR2UV(rex->offs)
385 );
386 );
92da3157 387 paren = *maxopenparen_p;
620d5b66 388 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 389 SSize_t tmps;
1ca2007e 390 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
391 rex->offs[paren].start = SSPOPIV;
392 tmps = SSPOPIV;
b93070ed
DM
393 if (paren <= rex->lastparen)
394 rex->offs[paren].end = tmps;
495f47a5
DM
395 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
396 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
397 (UV)paren,
398 (IV)rex->offs[paren].start,
399 (IV)rex->offs[paren].start_tmp,
400 (IV)rex->offs[paren].end,
401 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 402 );
87c0511b 403 paren--;
a0d0e21e 404 }
daf18116 405#if 1
dafc8851
JH
406 /* It would seem that the similar code in regtry()
407 * already takes care of this, and in fact it is in
408 * a better location to since this code can #if 0-ed out
409 * but the code in regtry() is needed or otherwise tests
410 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
411 * (as of patchlevel 7877) will fail. Then again,
412 * this code seems to be necessary or otherwise
225593e1
DM
413 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
414 * --jhi updated by dapm */
b93070ed 415 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 416 if (i > *maxopenparen_p)
b93070ed
DM
417 rex->offs[i].start = -1;
418 rex->offs[i].end = -1;
495f47a5
DM
419 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
420 " \\%"UVuf": %s ..-1 undeffing\n",
421 (UV)i,
92da3157 422 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 423 ));
a0d0e21e 424 }
dafc8851 425#endif
a0d0e21e
LW
426}
427
74088413
DM
428/* restore the parens and associated vars at savestack position ix,
429 * but without popping the stack */
430
431STATIC void
92da3157 432S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
433{
434 I32 tmpix = PL_savestack_ix;
435 PL_savestack_ix = ix;
92da3157 436 regcppop(rex, maxopenparen_p);
74088413
DM
437 PL_savestack_ix = tmpix;
438}
439
02db2b7b 440#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 441
31c7f561
KW
442STATIC bool
443S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
444{
445 /* Returns a boolean as to whether or not 'character' is a member of the
446 * Posix character class given by 'classnum' that should be equivalent to a
447 * value in the typedef '_char_class_number'.
448 *
449 * Ideally this could be replaced by a just an array of function pointers
450 * to the C library functions that implement the macros this calls.
451 * However, to compile, the precise function signatures are required, and
452 * these may vary from platform to to platform. To avoid having to figure
453 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
454 * which adds an extra layer of function call overhead (unless the C
455 * optimizer strips it away). But we don't particularly care about
456 * performance with locales anyway. */
31c7f561
KW
457
458 switch ((_char_class_number) classnum) {
15861f94 459 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 460 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
461 case _CC_ENUM_ASCII: return isASCII_LC(character);
462 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
463 case _CC_ENUM_CASED: return isLOWER_LC(character)
464 || isUPPER_LC(character);
e8d596e0 465 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
466 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
467 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
468 case _CC_ENUM_LOWER: return isLOWER_LC(character);
469 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 470 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 471 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 472 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
473 case _CC_ENUM_UPPER: return isUPPER_LC(character);
474 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 475 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
476 default: /* VERTSPACE should never occur in locales */
477 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
478 }
479
480 assert(0); /* NOTREACHED */
481 return FALSE;
482}
483
3018b823
KW
484STATIC bool
485S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
486{
487 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
488 * 'character' is a member of the Posix character class given by 'classnum'
489 * that should be equivalent to a value in the typedef
490 * '_char_class_number'.
491 *
492 * This just calls isFOO_lc on the code point for the character if it is in
493 * the range 0-255. Outside that range, all characters avoid Unicode
494 * rules, ignoring any locale. So use the Unicode function if this class
495 * requires a swash, and use the Unicode macro otherwise. */
496
497 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
498
499 if (UTF8_IS_INVARIANT(*character)) {
500 return isFOO_lc(classnum, *character);
501 }
502 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
503 return isFOO_lc(classnum,
94bb8c36 504 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
505 }
506
507 if (classnum < _FIRST_NON_SWASH_CC) {
508
509 /* Initialize the swash unless done already */
510 if (! PL_utf8_swash_ptrs[classnum]) {
511 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
512 PL_utf8_swash_ptrs[classnum] =
513 _core_swash_init("utf8",
514 "",
515 &PL_sv_undef, 1, 0,
516 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
517 }
518
92a2046b
KW
519 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
520 character,
521 TRUE /* is UTF */ ));
3018b823
KW
522 }
523
524 switch ((_char_class_number) classnum) {
525 case _CC_ENUM_SPACE:
526 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
527
528 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
529 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
530 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
531 default: return 0; /* Things like CNTRL are always
532 below 256 */
533 }
534
535 assert(0); /* NOTREACHED */
536 return FALSE;
537}
538
a687059c 539/*
e50aee73 540 * pregexec and friends
a687059c
LW
541 */
542
76234dfb 543#ifndef PERL_IN_XSUB_RE
a687059c 544/*
c277df42 545 - pregexec - match a regexp against a string
a687059c 546 */
c277df42 547I32
5aaab254 548Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 549 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
550/* stringarg: the point in the string at which to begin matching */
551/* strend: pointer to null at end of string */
552/* strbeg: real beginning of string */
553/* minend: end of match must be >= minend bytes after stringarg. */
554/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
555 * itself is accessed via the pointers above */
556/* nosave: For optimizations. */
c277df42 557{
7918f24d
NC
558 PERL_ARGS_ASSERT_PREGEXEC;
559
c277df42 560 return
9041c2e3 561 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
562 nosave ? 0 : REXEC_COPY_STR);
563}
76234dfb 564#endif
22e551b9 565
9041c2e3 566/*
cad2e5aa
JH
567 * Need to implement the following flags for reg_anch:
568 *
569 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
570 * USE_INTUIT_ML
571 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
572 * INTUIT_AUTORITATIVE_ML
573 * INTUIT_ONCE_NOML - Intuit can match in one location only.
574 * INTUIT_ONCE_ML
575 *
576 * Another flag for this function: SECOND_TIME (so that float substrs
577 * with giant delta may be not rechecked).
578 */
579
3f7c398e 580/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
581 Otherwise, only SvCUR(sv) is used to get strbeg. */
582
6eb5f6b9
JH
583/* XXXX Some places assume that there is a fixed substring.
584 An update may be needed if optimizer marks as "INTUITable"
585 RExen without fixed substrings. Similarly, it is assumed that
586 lengths of all the strings are no more than minlen, thus they
587 cannot come from lookahead.
40d049e4
YO
588 (Or minlen should take into account lookahead.)
589 NOTE: Some of this comment is not correct. minlen does now take account
590 of lookahead/behind. Further research is required. -- demerphq
591
592*/
6eb5f6b9 593
2c2d71f5
JH
594/* A failure to find a constant substring means that there is no need to make
595 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 596 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
597 regtry() should be needed.
598
599 REx compiler's optimizer found 4 possible hints:
600 a) Anchored substring;
601 b) Fixed substring;
602 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 603 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 604 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
605 string which does not contradict any of them.
606 */
2c2d71f5 607
6eb5f6b9
JH
608/* Most of decisions we do here should have been done at compile time.
609 The nodes of the REx which we used for the search should have been
610 deleted from the finite automaton. */
611
52a21eb3
DM
612/* args:
613 * rx: the regex to match against
614 * sv: the SV being matched: only used for utf8 flag; the string
615 * itself is accessed via the pointers below. Note that on
616 * something like an overloaded SV, SvPOK(sv) may be false
617 * and the string pointers may point to something unrelated to
618 * the SV itself.
619 * strbeg: real beginning of string
620 * strpos: the point in the string at which to begin matching
621 * strend: pointer to the byte following the last char of the string
622 * flags currently unused; set to 0
623 * data: currently unused; set to NULL
624 */
625
cad2e5aa 626char *
52a21eb3
DM
627Perl_re_intuit_start(pTHX_
628 REGEXP * const rx,
629 SV *sv,
630 const char * const strbeg,
631 char *strpos,
632 char *strend,
633 const U32 flags,
634 re_scream_pos_data *data)
cad2e5aa 635{
97aff369 636 dVAR;
8d919b0a 637 struct regexp *const prog = ReANY(rx);
49f55535 638 SSize_t start_shift = 0;
cad2e5aa 639 /* Should be nonnegative! */
ea3daa5d 640 SSize_t end_shift = 0;
0fc004dd
DM
641 /* current lowest pos in string where the regex can start matching */
642 char *rx_origin = strpos;
eb578fdb 643 SV *check;
f2ed9b32 644 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 645 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 646 bool ml_anch = 0;
8f4bf5fc 647 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 648 char *check_at = NULL; /* check substr found at this pos */
d8080198 649 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 650 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 651 RXi_GET_DECL(prog,progi);
02d5137b
DM
652 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
653 regmatch_info *const reginfo = &reginfo_buf;
30944b6d 654#ifdef DEBUGGING
890ce7af 655 const char * const i_strpos = strpos;
30944b6d 656#endif
a3621e74
YO
657 GET_RE_DEBUG_FLAGS_DECL;
658
7918f24d 659 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
660 PERL_UNUSED_ARG(flags);
661 PERL_UNUSED_ARG(data);
7918f24d 662
1dc475d0
DM
663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
664 "Intuit: trying to determine minimum start position...\n"));
665
fb9bbddb
DM
666 /* for now, assume that all substr offsets are positive. If at some point
667 * in the future someone wants to do clever things with look-behind and
668 * -ve offsets, they'll need to fix up any code in this function
669 * which uses these offsets. See the thread beginning
670 * <20140113145929.GF27210@iabyn.com>
671 */
672 assert(prog->substrs->data[0].min_offset >= 0);
673 assert(prog->substrs->data[0].max_offset >= 0);
674 assert(prog->substrs->data[1].min_offset >= 0);
675 assert(prog->substrs->data[1].max_offset >= 0);
676 assert(prog->substrs->data[2].min_offset >= 0);
677 assert(prog->substrs->data[2].max_offset >= 0);
678
c344f387
JH
679 /* CHR_DIST() would be more correct here but it makes things slow. */
680 if (prog->minlen > strend - strpos) {
a3621e74 681 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 682 " String too short...\n"));
cad2e5aa 683 goto fail;
2c2d71f5 684 }
d8da0584 685
6c3fea77 686 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 687 reginfo->info_aux = NULL;
9d9163fb 688 reginfo->strbeg = strbeg;
220db18a 689 reginfo->strend = strend;
aed7b151 690 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 691 reginfo->intuit = 1;
1cb48e53
DM
692 /* not actually used within intuit, but zero for safety anyway */
693 reginfo->poscache_maxiter = 0;
02d5137b 694
f2ed9b32 695 if (utf8_target) {
33b8afdf
JH
696 if (!prog->check_utf8 && prog->check_substr)
697 to_utf8_substr(prog);
698 check = prog->check_utf8;
699 } else {
7e0d5ad7
KW
700 if (!prog->check_substr && prog->check_utf8) {
701 if (! to_byte_substr(prog)) {
6b54ddc5 702 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
703 }
704 }
33b8afdf
JH
705 check = prog->check_substr;
706 }
274cd312 707
1dc475d0
DM
708 /* dump the various substring data */
709 DEBUG_OPTIMISE_MORE_r({
710 int i;
711 for (i=0; i<=2; i++) {
712 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
713 : prog->substrs->data[i].substr);
714 if (!sv)
715 continue;
716
717 PerlIO_printf(Perl_debug_log,
718 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
719 " useful=%"IVdf" utf8=%d [%s]\n",
720 i,
721 (IV)prog->substrs->data[i].min_offset,
722 (IV)prog->substrs->data[i].max_offset,
723 (IV)prog->substrs->data[i].end_shift,
724 BmUSEFUL(sv),
725 utf8_target ? 1 : 0,
726 SvPEEK(sv));
727 }
728 });
729
8e1490ee 730 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
274cd312
DM
731 /* Check after \n? */
732 ml_anch = ( (prog->intflags & PREGf_ANCH_MBOL)
733 || ((prog->intflags & PREGf_ANCH_BOL) && multiline));
cad2e5aa 734
7e25d62c 735 if (!ml_anch) {
c889ccc8
DM
736 /* we are only allowed to match at BOS or \G */
737
57fcbfa7
DM
738 /* trivially reject if there's a BOS anchor and we're not at BOS.
739 * In the case of \G, we hope(!) that the caller has already
740 * set strpos to pos()-gofs, and will already have checked
741 * that this anchor position is legal. So we can skip it here.
742 */
743 if ( !(prog->intflags & PREGf_ANCH_GPOS)
744 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
745 && (strpos != strbeg))
c889ccc8
DM
746 {
747 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 748 " Not at start...\n"));
c889ccc8
DM
749 goto fail;
750 }
751
a5d12a4b
DM
752 /* in the presence of an anchor, the anchored (relative to the
753 * start of the regex) substr must also be anchored relative
754 * to strpos. So quickly reject if substr isn't found there */
755
c889ccc8
DM
756 if (prog->check_offset_min == prog->check_offset_max
757 && !(prog->intflags & PREGf_CANY_SEEN)
758 && ! multiline) /* /m can cause \n's to match that aren't
759 accounted for in the string max length.
760 See [perl #115242] */
761 {
762 /* Substring at constant offset from beg-of-str... */
d307bf57 763 SSize_t slen = SvCUR(check);
7ef63983 764 char *s;
c889ccc8
DM
765
766 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 767
1dc475d0
DM
768 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
769 " Looking for check substr at fixed offset %"IVdf"...\n",
770 (IV)prog->check_offset_min));
771
c889ccc8 772 if (SvTAIL(check)) {
d307bf57
DM
773 /* In this case, the regex is anchored at the end too,
774 * so the lengths must match exactly, give or take a \n.
775 * NB: slen >= 1 since the last char of check is \n */
c889ccc8
DM
776 if ( strend - s > slen || strend - s < slen - 1
777 || (strend - s == slen && strend[-1] != '\n'))
778 {
779 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 780 " String too long...\n"));
c889ccc8
DM
781 goto fail_finish;
782 }
783 /* Now should match s[0..slen-2] */
784 slen--;
c889ccc8 785 }
d307bf57
DM
786 if (slen && (*SvPVX_const(check) != *s
787 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
788 {
789 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 790 " String not equal...\n"));
d307bf57
DM
791 goto fail_finish;
792 }
c889ccc8
DM
793
794 check_at = s;
795 goto success_at_start;
cad2e5aa 796 }
cad2e5aa 797 }
cad2e5aa 798 }
0fc004dd 799
c0e0ec46
DM
800 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
801 end_shift = prog->check_end_shift;
cad2e5aa 802
19188028 803#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 804 if (end_shift < 0)
1de06328 805 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 806 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
807#endif
808
2c2d71f5 809 restart:
0fc004dd
DM
810 /* Find a candidate regex origin in the region rx_origin..strend
811 * by looking for the "check" substring in that region, corrected by
812 * start/end_shift.
813 */
1de06328
YO
814
815 {
c33e64f0
FC
816 U8* start_point;
817 U8* end_point;
c889ccc8 818
c889ccc8 819 DEBUG_OPTIMISE_MORE_r({
1dc475d0 820 PerlIO_printf(Perl_debug_log,
ae5d4331 821 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
1dc475d0 822 " Start shift: %"IVdf" End shift %"IVdf
4d281893 823 " Real end Shift: %"IVdf"\n",
0fc004dd 824 (IV)(rx_origin - i_strpos),
c889ccc8 825 (IV)prog->check_offset_min,
12fbc530
DM
826 (IV)start_shift,
827 (IV)end_shift,
c889ccc8
DM
828 (IV)prog->check_end_shift);
829 });
1de06328 830
0d331aaf 831 if (prog->intflags & PREGf_CANY_SEEN) {
0fc004dd 832 start_point= (U8*)(rx_origin + start_shift);
12fbc530 833 end_point= (U8*)(strend - end_shift);
1de06328 834 } else {
0fc004dd 835 start_point= HOP3(rx_origin, start_shift, strend);
12fbc530 836 end_point= HOP3(strend, -end_shift, strbeg);
1de06328 837 }
c889ccc8 838
77656d5b
DM
839 /* if the regex is absolutely anchored to the start of the string,
840 * then check_offset_max represents an upper bound on the string
841 * where the substr could start */
c19c836a
DM
842 if (!ml_anch
843 && prog->intflags & PREGf_ANCH
77656d5b
DM
844 && prog->check_offset_max != SSize_t_MAX
845 && start_shift < prog->check_offset_max)
c19c836a 846 {
1a08ba3a
DM
847 SSize_t len = SvCUR(check) - !!SvTAIL(check);
848 end_point = HOP3lim(start_point,
849 prog->check_offset_max - start_shift,
850 end_point -len)
851 + len;
d6ef1678
DM
852 }
853
6bda09f9 854 DEBUG_OPTIMISE_MORE_r({
1dc475d0 855 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
1de06328 856 (int)(end_point - start_point),
fc8cd66c 857 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
858 start_point);
859 });
860
ae5d4331 861 check_at = fbm_instr( start_point, end_point,
7fba1cd6 862 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 863 }
c889ccc8 864
cad2e5aa
JH
865 /* Update the count-of-usability, remove useless subpatterns,
866 unshift s. */
2c2d71f5 867
ab3bbdeb 868 DEBUG_EXECUTE_r({
f2ed9b32 869 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb 870 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1dc475d0 871 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
ae5d4331 872 (check_at ? "Found" : "Did not find"),
f2ed9b32 873 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
874 ? "anchored" : "floating"),
875 quoted,
876 RE_SV_TAIL(check),
ae5d4331 877 (check_at ? " at offset " : "...\n") );
ab3bbdeb 878 });
2c2d71f5 879
ae5d4331 880 if (!check_at)
2c2d71f5 881 goto fail_finish;
2c2d71f5 882 /* Finish the diagnostic message */
ae5d4331 883 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - i_strpos)) );
2c2d71f5 884
fdc003fd 885 /* set rx_origin to the minimum position where the regex could start
b96863b8
DM
886 * matching, given the constraint of the just-matched check substring.
887 * But don't set it lower than previously.
fdc003fd
DM
888 */
889
b96863b8
DM
890 if (check_at - rx_origin > prog->check_offset_max)
891 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
fdc003fd
DM
892
893
1de06328
YO
894 /* XXX dmq: first branch is for positive lookbehind...
895 Our check string is offset from the beginning of the pattern.
896 So we need to do any stclass tests offset forward from that
897 point. I think. :-(
898 */
899
2c2d71f5
JH
900 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
901 Start with the other substr.
902 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 903 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
904 *always* match. Probably should be marked during compile...
905 Probably it is right to do no SCREAM here...
906 */
907
6480a6c4
DM
908 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
909 : prog->substrs->data[other_ix].substr)
1de06328 910 {
30944b6d 911 /* Take into account the "other" substring. */
2d52c850 912 char *last, *last1;
f20d3862 913 char *s;
2d52c850 914 SV* must;
07544f89
DM
915 struct reg_substr_datum *other;
916
917 do_other_substr:
918 other = &prog->substrs->data[other_ix];
2d52c850 919
ead917d0 920 /* if "other" is anchored:
1c1c599d 921 * we've previously found a floating substr starting at check_at.
2d52c850 922 * This means that the regex origin must lie somewhere
1c1c599d
DM
923 * between min (rx_origin): HOP3(check_at, -check_offset_max)
924 * and max: HOP3(check_at, -check_offset_min)
2d52c850
DM
925 * (except that min will be >= strpos)
926 * So the fixed substr must lie somewhere between
927 * HOP3(min, anchored_offset)
928 * HOP3(max, anchored_offset) + SvCUR(substr)
929 */
97c6112e 930
ead917d0
DM
931 /* if "other" is floating
932 * Calculate last1, the absolute latest point where the
4d281893
DM
933 * floating substr could start in the string, ignoring any
934 * constraints from the earlier fixed match. It is calculated
935 * as follows:
936 *
937 * strend - prog->minlen (in chars) is the absolute latest
938 * position within the string where the origin of the regex
939 * could appear. The latest start point for the floating
940 * substr is float_min_offset(*) on from the start of the
941 * regex. last1 simply combines thee two offsets.
942 *
943 * (*) You might think the latest start point should be
944 * float_max_offset from the regex origin, and technically
945 * you'd be correct. However, consider
946 * /a\d{2,4}bcd\w/
947 * Here, float min, max are 3,5 and minlen is 7.
948 * This can match either
949 * /a\d\dbcd\w/
950 * /a\d\d\dbcd\w/
951 * /a\d\d\d\dbcd\w/
952 * In the first case, the regex matches minlen chars; in the
953 * second, minlen+1, in the third, minlen+2.
954 * In the first case, the floating offset is 3 (which equals
955 * float_min), in the second, 4, and in the third, 5 (which
956 * equals float_max). In all cases, the floating string bcd
957 * can never start more than 4 chars from the end of the
958 * string, which equals minlen - float_min. As the substring
959 * starts to match more than float_min from the start of the
960 * regex, it makes the regex match more than minlen chars,
961 * and the two cancel each other out. So we can always use
962 * float_min - minlen, rather than float_max - minlen for the
963 * latest position in the string.
964 *
965 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
eb3831ce
DM
966 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
967 */
ead917d0
DM
968
969 if (!other_ix)
970 assert(prog->minlen > other->min_offset);
971
972 last1 = HOP3c(strend,
973 other->min_offset - prog->minlen, strbeg);
974
975 if (other_ix) {
976 /* last is the latest point where the floating substr could
977 * start, *given* any constraints from the earlier fixed
978 * match. This constraint is that the floating string starts
979 * <= float_max_offset chars from the regex origin (rx_origin).
980 * If this value is less than last1, use it instead.
981 */
982 assert(rx_origin <= last1);
983 last =
984 /* this condition handles the offset==infinity case, and
985 * is a short-cut otherwise. Although it's comparing a
986 * byte offset to a char length, it does so in a safe way,
987 * since 1 char always occupies 1 or more bytes,
988 * so if a string range is (last1 - rx_origin) bytes,
989 * it will be less than or equal to (last1 - rx_origin)
990 * chars; meaning it errs towards doing the accurate HOP3
991 * rather than just using last1 as a short-cut */
992 (last1 - rx_origin) < other->max_offset
993 ? last1
994 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
995 }
996 else {
1c1c599d
DM
997 assert(strpos + start_shift <= check_at);
998 last = HOP4c(check_at, other->min_offset - start_shift,
ead917d0
DM
999 strbeg, strend);
1000 }
1001
1002 s = HOP3c(rx_origin, other->min_offset, strend);
1003 if (s < other_last) /* These positions already checked */
1004 s = other_last;
e9321f50 1005
826b9972 1006 must = utf8_target ? other->utf8_substr : other->substr;
9ed4de72 1007 assert(SvPOK(must));
ead917d0
DM
1008 s = fbm_instr(
1009 (unsigned char*)s,
1010 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1011 must,
1012 multiline ? FBMrf_MULTILINE : 0
1013 );
1014 DEBUG_EXECUTE_r({
1015 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1016 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1017 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1018 s ? "Found" : "Contradicts",
1019 other_ix ? "floating" : "anchored",
1020 quoted, RE_SV_TAIL(must));
ab3bbdeb 1021 });
e9321f50 1022
ead917d0
DM
1023
1024 if (!s) {
1025 /* last1 is latest possible substr location. If we didn't
e9321f50 1026 * find it before there, we never will */
ead917d0
DM
1027 if (last >= last1) {
1028 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1029 ", giving up...\n"));
1030 goto fail_finish;
1031 }
1032
1033 /* try to find the check substr again at a later
1034 * position. Maybe next time we'll find the "other" substr
1035 * in range too */
1036 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1037 ", trying %s at offset %ld...\n",
1038 (other_ix ? "floating" : "anchored"),
1c1c599d 1039 (long)(HOP3c(check_at, 1, strend) - i_strpos)));
ead917d0
DM
1040
1041 other_last = HOP3c(last, 1, strend) /* highest failure */;
a6566820
DM
1042 rx_origin =
1043 other_ix
1044 ? HOP3c(rx_origin, 1, strend)
1045 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
ead917d0
DM
1046 goto restart;
1047 }
1048 else {
1049 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
1050 (long)(s - i_strpos)));
1051
1052 if (other_ix) {
1053 /* other_last is set to s, not s+1, since its possible for
1054 * a floating substr to fail first time, then succeed
1055 * second time at the same floating position; e.g.:
1056 * "-AB--AABZ" =~ /\wAB\d*Z/
1057 * The first time round, anchored and float match at
1058 * "-(AB)--AAB(Z)" then fail on the initial \w character
1059 * class. Second time round, they match at "-AB--A(AB)(Z)".
1060 */
1061 other_last = s;
1062 }
1063 else {
1064 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1065 other_last = HOP3c(s, 1, strend);
1066 }
ead917d0 1067 }
cad2e5aa 1068 }
acba93e8
DM
1069 else {
1070 DEBUG_OPTIMISE_MORE_r(
1071 PerlIO_printf(Perl_debug_log,
1072 " Check-only match: offset min:%"IVdf" max:%"IVdf
1c1c599d 1073 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
acba93e8
DM
1074 " strend-strpos:%"IVdf"\n",
1075 (IV)prog->check_offset_min,
1076 (IV)prog->check_offset_max,
1c1c599d 1077 (IV)(check_at-strpos),
acba93e8 1078 (IV)(rx_origin-strpos),
1c1c599d 1079 (IV)(rx_origin-check_at),
acba93e8
DM
1080 (IV)(strend-strpos)
1081 )
1082 );
1083 }
2c2d71f5 1084
acba93e8 1085 postprocess_substr_matches:
0991020e 1086
e3c6feb0
DM
1087 /* handle the extra constraint of /^/m */
1088
a71a26b9
DM
1089 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n'
1090 /* May be due to an implicit anchor of m{.*foo} */
1091 && !(prog->intflags & PREGf_IMPLICIT))
e3c6feb0 1092 {
a62659bd
DM
1093 char *t;
1094
1095 /* Eventually fbm_*() should handle this, but often
1096 anchored_offset is not 0, so this check will not be wasted. */
1097 /* XXXX In the code below we prefer to look for "^" even in
1098 presence of anchored substrings. And we search even
1099 beyond the found float position. These pessimizations
1100 are historical artefacts only. */
1101 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1102 " looking for /^/m anchor"));
1103 t = rx_origin;
1104 while (t < strend - prog->minlen) {
1105 if (*t == '\n') {
1106 if (t < check_at - prog->check_offset_min) {
1107 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1108 /* Since we moved from the found position,
1109 we definitely contradict the found anchored
1110 substr. Due to the above check we do not
1111 contradict "check" substr.
1112 Thus we can arrive here only if check substr
1113 is float. Redo checking for "other"=="fixed".
1114 */
1115 rx_origin = strpos = t + 1;
1116 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1117 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1118 assert(prog->substrs->check_ix); /* other is float */
1119 goto do_other_substr;
1120 }
1121 /* We don't contradict the found floating substring. */
1122 /* XXXX Why not check for STCLASS? */
c765d6e0 1123 rx_origin = t + 1;
a62659bd 1124 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " Found /%s^%s/m at offset %ld...\n",
c765d6e0 1125 PL_colors[0], PL_colors[1], (long)(rx_origin - i_strpos)));
a62659bd
DM
1126 break; /* success: found anchor */
1127 }
1128 /* Position contradicts check-string */
1129 /* XXXX probably better to look for check-string
1130 than for "\n", so one should lower the limit for t? */
1131 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1132 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1133 other_last = strpos = rx_origin = t + 1;
1134 goto restart;
e3c6feb0 1135 }
a62659bd
DM
1136 t++;
1137 }
1138 if (t >= strend - prog->minlen) {
1139 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " Did not find /%s^%s/m...\n",
1140 PL_colors[0], PL_colors[1]));
1141 goto fail_finish;
1142 }
e3c6feb0
DM
1143 }
1144 else {
1145 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " Starting position does not contradict /%s^%s/m...\n",
1146 PL_colors[0], PL_colors[1]));
1147 }
1148
1149
1150 /* Decide whether using the substrings helped */
1151
1152 if (rx_origin != strpos) {
1153 /* Fixed substring is found far enough so that the match
1154 cannot start at strpos. */
1155
1156 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
f2ed9b32 1157 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1158 }
1159 else {
f5952150 1160 /* The found string does not prohibit matching at strpos,
2c2d71f5 1161 - no optimization of calling REx engine can be performed,
f5952150
GS
1162 unless it was an MBOL and we are not after MBOL,
1163 or a future STCLASS check will fail this. */
2c2d71f5 1164 success_at_start:
bbe252da 1165 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1166 && (utf8_target ? (
33b8afdf
JH
1167 prog->check_utf8 /* Could be deleted already */
1168 && --BmUSEFUL(prog->check_utf8) < 0
1169 && (prog->check_utf8 == prog->float_utf8)
1170 ) : (
1171 prog->check_substr /* Could be deleted already */
1172 && --BmUSEFUL(prog->check_substr) < 0
1173 && (prog->check_substr == prog->float_substr)
1174 )))
66e933ab 1175 {
cad2e5aa 1176 /* If flags & SOMETHING - do not do it many times on the same match */
1dc475d0 1177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
f2ed9b32
KW
1178 /* XXX Does the destruction order has to change with utf8_target? */
1179 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1180 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1181 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1182 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1183 check = NULL; /* abort */
486ec47a 1184 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951 1185 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
8e1490ee
YO
1186 if (prog->intflags & PREGf_IMPLICIT) {
1187 prog->intflags &= ~PREGf_ANCH_MBOL;
1188 /* maybe we have no anchors left after this... */
1189 if (!(prog->intflags & PREGf_ANCH))
1190 prog->extflags &= ~RXf_IS_ANCHORED;
1191 }
3cf5c195
IZ
1192 /* XXXX This is a remnant of the old implementation. It
1193 looks wasteful, since now INTUIT can use many
6eb5f6b9 1194 other heuristics. */
bbe252da 1195 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1196 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa 1197 }
cad2e5aa
JH
1198 }
1199
6eb5f6b9
JH
1200 /* Last resort... */
1201 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1202 /* trie stclasses are too expensive to use here, we are better off to
1203 leave it to regmatch itself */
f8fc2ecf 1204 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1205 /* minlen == 0 is possible if regstclass is \b or \B,
1206 and the fixed substr is ''$.
c765d6e0
DM
1207 Since minlen is already taken into account, rx_origin+1 is before strend;
1208 accidentally, minlen >= 1 guaranties no false positives at rx_origin + 1
6eb5f6b9
JH
1209 even for \b or \B. But (minlen? 1 : 0) below assumes that
1210 regstclass does not come from lookahead... */
1211 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1212 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf 1213 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e
DM
1214 char *t;
1215
2c75e362 1216 /* XXX this value could be pre-computed */
f8fc2ecf 1217 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1218 ? (reginfo->is_utf8_pat
1219 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1220 : STR_LEN(progi->regstclass))
66e933ab 1221 : 1);
1de06328 1222 char * endpos;
efb9848c 1223 char *s = rx_origin;
1de06328
YO
1224 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1225 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1226 else if (prog->float_substr || prog->float_utf8)
1227 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1228 else
1229 endpos= strend;
1230
d8080198
YO
1231 if (checked_upto < s)
1232 checked_upto = s;
1dc475d0
DM
1233 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1234 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
1235 " s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1236 (IV)start_shift, (IV)(check_at - strbeg),
1237 (IV)(s - strbeg), (IV)(endpos - strbeg),
1238 (IV)(checked_upto- strbeg)));
d8080198 1239
6eb5f6b9 1240 t = s;
984e6dd1 1241 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
f9176b44 1242 reginfo);
d8080198
YO
1243 if (s) {
1244 checked_upto = s;
1245 } else {
6eb5f6b9 1246#ifdef DEBUGGING
cbbf8932 1247 const char *what = NULL;
6eb5f6b9
JH
1248#endif
1249 if (endpos == strend) {
a3621e74 1250 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1251 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1252 goto fail;
1253 }
a3621e74 1254 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1255 " This position contradicts STCLASS...\n") );
8e1490ee 1256 if ((prog->intflags & PREGf_ANCH) && !ml_anch)
653099ff 1257 goto fail;
d8080198 1258 checked_upto = HOPBACKc(endpos, start_shift);
1dc475d0 1259 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
d8080198 1260 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1261 /* Contradict one of substrings */
33b8afdf 1262 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1263 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1264 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1265 hop_and_restart:
1aa99e6b 1266 s = HOP3c(t, 1, strend);
66e933ab
GS
1267 if (s + start_shift + end_shift > strend) {
1268 /* XXXX Should be taken into account earlier? */
a3621e74 1269 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1270 " Could not match STCLASS...\n") );
66e933ab
GS
1271 goto fail;
1272 }
c765d6e0 1273 rx_origin = s;
5e39e1e5
HS
1274 if (!check)
1275 goto giveup;
a3621e74 1276 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1277 " Looking for %s substr starting at offset %ld...\n",
0fc004dd 1278 what, (long)(rx_origin + start_shift - i_strpos)) );
6eb5f6b9
JH
1279 goto restart;
1280 }
66e933ab 1281 /* Have both, check_string is floating */
6eb5f6b9
JH
1282 if (t + start_shift >= check_at) /* Contradicts floating=check */
1283 goto retry_floating_check;
1284 /* Recheck anchored substring, but not floating... */
c765d6e0
DM
1285 if (!check) {
1286 rx_origin = NULL;
5e39e1e5 1287 goto giveup;
c765d6e0 1288 }
a3621e74 1289 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1290 " Looking for anchored substr starting at offset %ld...\n",
6eb5f6b9 1291 (long)(other_last - i_strpos)) );
c347b528
DM
1292 assert(prog->substrs->check_ix); /* other is float */
1293 goto do_other_substr;
6eb5f6b9 1294 }
60e71179
GS
1295 /* Another way we could have checked stclass at the
1296 current position only: */
1297 if (ml_anch) {
3043e7b4 1298 s = rx_origin = t + 1;
5e39e1e5
HS
1299 if (!check)
1300 goto giveup;
a3621e74 1301 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1302 " Looking for /%s^%s/m starting at offset %ld...\n",
3043e7b4
DM
1303 PL_colors[0], PL_colors[1],
1304 (long)(rx_origin - i_strpos)) );
acba93e8
DM
1305 /* XXX DAPM I don't yet know why this is true, but the code
1306 * assumed it when it used to do goto try_at_offset */
1307 assert(rx_origin != strpos);
1308 goto postprocess_substr_matches;
66e933ab 1309 }
f2ed9b32 1310 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1311 goto fail;
486ec47a 1312 /* Check is floating substring. */
6eb5f6b9
JH
1313 retry_floating_check:
1314 t = check_at - start_shift;
a3621e74 1315 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1316 goto hop_and_restart;
1317 }
b7953727 1318 if (t != s) {
a3621e74 1319 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1320 " By STCLASS: moving %ld --> %ld\n",
b7953727
JH
1321 (long)(t - i_strpos), (long)(s - i_strpos))
1322 );
1323 }
1324 else {
a3621e74 1325 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1326 " Does not contradict STCLASS...\n");
b7953727
JH
1327 );
1328 }
6eb5f6b9 1329 }
5e39e1e5 1330 giveup:
1dc475d0
DM
1331 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Intuit: %s%s:%s match at offset %ld\n",
1332 PL_colors[4], (check ? "Successfully guessed" : "Giving up"),
c765d6e0
DM
1333 PL_colors[5], (long)(rx_origin - i_strpos)) );
1334 return rx_origin;
2c2d71f5
JH
1335
1336 fail_finish: /* Substring not found */
33b8afdf 1337 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1338 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1339 fail:
a3621e74 1340 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1341 PL_colors[4], PL_colors[5]));
bd61b366 1342 return NULL;
cad2e5aa 1343}
9661b544 1344
a0a388a1 1345#define DECL_TRIE_TYPE(scan) \
098b07d5
KW
1346 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1347 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
fab2782b
YO
1348 trie_type = ((scan->flags == EXACT) \
1349 ? (utf8_target ? trie_utf8 : trie_plain) \
098b07d5
KW
1350 : (scan->flags == EXACTFA) \
1351 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1352 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
fab2782b 1353
fd3249ee 1354#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1355STMT_START { \
fab2782b 1356 STRLEN skiplen; \
baa60164 1357 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1358 switch (trie_type) { \
31f05a37 1359 case trie_utf8_exactfa_fold: \
baa60164
KW
1360 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1361 /* FALL THROUGH */ \
fab2782b
YO
1362 case trie_utf8_fold: \
1363 if ( foldlen>0 ) { \
c80e42f3 1364 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1365 foldlen -= len; \
1366 uscan += len; \
1367 len=0; \
1368 } else { \
445bf929 1369 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1370 len = UTF8SKIP(uc); \
1371 skiplen = UNISKIP( uvc ); \
1372 foldlen -= skiplen; \
1373 uscan = foldbuf + skiplen; \
1374 } \
1375 break; \
baa60164
KW
1376 case trie_latin_utf8_exactfa_fold: \
1377 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1378 /* FALL THROUGH */ \
fab2782b
YO
1379 case trie_latin_utf8_fold: \
1380 if ( foldlen>0 ) { \
c80e42f3 1381 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1382 foldlen -= len; \
1383 uscan += len; \
1384 len=0; \
1385 } else { \
1386 len = 1; \
31f05a37 1387 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1388 skiplen = UNISKIP( uvc ); \
1389 foldlen -= skiplen; \
1390 uscan = foldbuf + skiplen; \
1391 } \
1392 break; \
1393 case trie_utf8: \
c80e42f3 1394 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1395 break; \
1396 case trie_plain: \
1397 uvc = (UV)*uc; \
1398 len = 1; \
1399 } \
1400 if (uvc < 256) { \
1401 charid = trie->charmap[ uvc ]; \
1402 } \
1403 else { \
1404 charid = 0; \
1405 if (widecharmap) { \
1406 SV** const svpp = hv_fetch(widecharmap, \
1407 (char*)&uvc, sizeof(UV), 0); \
1408 if (svpp) \
1409 charid = (U16)SvIV(*svpp); \
1410 } \
1411 } \
4cadc6a9
YO
1412} STMT_END
1413
4cadc6a9
YO
1414#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1415STMT_START { \
1416 while (s <= e) { \
1417 if ( (CoNd) \
fac1af77 1418 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1419 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1420 goto got_it; \
1421 s++; \
1422 } \
1423} STMT_END
1424
1425#define REXEC_FBC_UTF8_SCAN(CoDe) \
1426STMT_START { \
9a902117 1427 while (s < strend) { \
4cadc6a9 1428 CoDe \
9a902117 1429 s += UTF8SKIP(s); \
4cadc6a9
YO
1430 } \
1431} STMT_END
1432
1433#define REXEC_FBC_SCAN(CoDe) \
1434STMT_START { \
1435 while (s < strend) { \
1436 CoDe \
1437 s++; \
1438 } \
1439} STMT_END
1440
1441#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1442REXEC_FBC_UTF8_SCAN( \
1443 if (CoNd) { \
02d5137b 1444 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1445 goto got_it; \
1446 else \
1447 tmp = doevery; \
1448 } \
1449 else \
1450 tmp = 1; \
1451)
1452
1453#define REXEC_FBC_CLASS_SCAN(CoNd) \
1454REXEC_FBC_SCAN( \
1455 if (CoNd) { \
02d5137b 1456 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1457 goto got_it; \
1458 else \
1459 tmp = doevery; \
1460 } \
1461 else \
1462 tmp = 1; \
1463)
1464
baa60164 1465#define REXEC_FBC_TRYIT \
02d5137b 1466if ((reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1467 goto got_it
1468
e1d1eefb 1469#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
baa60164 1470 if (utf8_target) { \
e1d1eefb
YO
1471 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1472 } \
1473 else { \
1474 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1475 }
e1d1eefb 1476
baa60164 1477#define DUMP_EXEC_POS(li,s,doutf8) \
9d9163fb 1478 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
6d59b646 1479 startpos, doutf8)
786e8c11 1480
cfaf538b 1481
baa60164 1482#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1483 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
cfaf538b
KW
1484 tmp = TEST_NON_UTF8(tmp); \
1485 REXEC_FBC_UTF8_SCAN( \
ce1d15d5 1486 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
cfaf538b
KW
1487 tmp = !tmp; \
1488 IF_SUCCESS; \
1489 } \
1490 else { \
1491 IF_FAIL; \
1492 } \
1493 ); \
1494
baa60164 1495#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1496 if (s == reginfo->strbeg) { \
cfaf538b
KW
1497 tmp = '\n'; \
1498 } \
1499 else { \
9d9163fb 1500 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
3db24e1e
KW
1501 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1502 0, UTF8_ALLOW_DEFAULT); \
cfaf538b
KW
1503 } \
1504 tmp = TeSt1_UtF8; \
3db24e1e 1505 LOAD_UTF8_CHARCLASS_ALNUM(); \
cfaf538b 1506 REXEC_FBC_UTF8_SCAN( \
3db24e1e 1507 if (tmp == ! (TeSt2_UtF8)) { \
cfaf538b
KW
1508 tmp = !tmp; \
1509 IF_SUCCESS; \
1510 } \
1511 else { \
1512 IF_FAIL; \
1513 } \
1514 ); \
1515
63ac0dad
KW
1516/* The only difference between the BOUND and NBOUND cases is that
1517 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1518 * NBOUND. This is accomplished by passing it in either the if or else clause,
1519 * with the other one being empty */
1520#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1521 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1522
1523#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1524 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1525
1526#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1527 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1528
1529#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1530 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1531
63ac0dad
KW
1532
1533/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1534 * be passed in completely with the variable name being tested, which isn't
1535 * such a clean interface, but this is easier to read than it was before. We
1536 * are looking for the boundary (or non-boundary between a word and non-word
1537 * character. The utf8 and non-utf8 cases have the same logic, but the details
1538 * must be different. Find the "wordness" of the character just prior to this
1539 * one, and compare it with the wordness of this one. If they differ, we have
1540 * a boundary. At the beginning of the string, pretend that the previous
1541 * character was a new-line */
baa60164 1542#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1543 if (utf8_target) { \
baa60164 1544 UTF8_CODE \
63ac0dad
KW
1545 } \
1546 else { /* Not utf8 */ \
9d9163fb 1547 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad
KW
1548 tmp = TEST_NON_UTF8(tmp); \
1549 REXEC_FBC_SCAN( \
1550 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1551 tmp = !tmp; \
1552 IF_SUCCESS; \
1553 } \
1554 else { \
1555 IF_FAIL; \
1556 } \
1557 ); \
1558 } \
baa60164 1559 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
63ac0dad
KW
1560 goto got_it;
1561
786e8c11 1562/* We know what class REx starts with. Try to find this position... */
02d5137b 1563/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1564/* annoyingly all the vars in this routine have different names from their counterparts
1565 in regmatch. /grrr */
1566
3c3eec57 1567STATIC char *
07be1b83 1568S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1569 const char *strend, regmatch_info *reginfo)
a687059c 1570{
73104a1b
KW
1571 dVAR;
1572 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1573 char *pat_string; /* The pattern's exactish string */
1574 char *pat_end; /* ptr to end char of pat_string */
1575 re_fold_t folder; /* Function for computing non-utf8 folds */
1576 const U8 *fold_array; /* array for folding ords < 256 */
1577 STRLEN ln;
1578 STRLEN lnc;
73104a1b
KW
1579 U8 c1;
1580 U8 c2;
1581 char *e;
1582 I32 tmp = 1; /* Scratch variable? */
ba44c216 1583 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1584 UV utf8_fold_flags = 0;
f9176b44 1585 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1586 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1587 with a result inverts that result, as 0^1 =
1588 1 and 1^1 = 0 */
1589 _char_class_number classnum;
1590
73104a1b 1591 RXi_GET_DECL(prog,progi);
2f7f8cb1 1592
73104a1b 1593 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1594
73104a1b
KW
1595 /* We know what class it must start with. */
1596 switch (OP(c)) {
1597 case ANYOF:
1598 if (utf8_target) {
1599 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1600 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1601 }
1602 else {
1603 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1604 }
1605 break;
1606 case CANY:
1607 REXEC_FBC_SCAN(
02d5137b 1608 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
73104a1b
KW
1609 goto got_it;
1610 else
1611 tmp = doevery;
1612 );
1613 break;
1614
098b07d5
KW
1615 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1616 assert(! is_utf8_pat);
1617 /* FALL THROUGH */
73104a1b 1618 case EXACTFA:
984e6dd1 1619 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1620 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1621 goto do_exactf_utf8;
1622 }
1623 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1624 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1625 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1626
2fdb7295
KW
1627 case EXACTF: /* This node only generated for non-utf8 patterns */
1628 assert(! is_utf8_pat);
73104a1b 1629 if (utf8_target) {
73104a1b
KW
1630 utf8_fold_flags = 0;
1631 goto do_exactf_utf8;
1632 }
1633 fold_array = PL_fold;
1634 folder = foldEQ;
1635 goto do_exactf_non_utf8;
1636
1637 case EXACTFL:
31f05a37 1638 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1639 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1640 goto do_exactf_utf8;
1641 }
1642 fold_array = PL_fold_locale;
1643 folder = foldEQ_locale;
1644 goto do_exactf_non_utf8;
3c760661 1645
73104a1b 1646 case EXACTFU_SS:
984e6dd1 1647 if (is_utf8_pat) {
73104a1b
KW
1648 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1649 }
1650 goto do_exactf_utf8;
16d951b7 1651
73104a1b 1652 case EXACTFU:
984e6dd1
DM
1653 if (is_utf8_pat || utf8_target) {
1654 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1655 goto do_exactf_utf8;
1656 }
fac1af77 1657
73104a1b
KW
1658 /* Any 'ss' in the pattern should have been replaced by regcomp,
1659 * so we don't have to worry here about this single special case
1660 * in the Latin1 range */
1661 fold_array = PL_fold_latin1;
1662 folder = foldEQ_latin1;
1663
1664 /* FALL THROUGH */
1665
1666 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1667 are no glitches with fold-length differences
1668 between the target string and pattern */
1669
1670 /* The idea in the non-utf8 EXACTF* cases is to first find the
1671 * first character of the EXACTF* node and then, if necessary,
1672 * case-insensitively compare the full text of the node. c1 is the
1673 * first character. c2 is its fold. This logic will not work for
1674 * Unicode semantics and the german sharp ss, which hence should
1675 * not be compiled into a node that gets here. */
1676 pat_string = STRING(c);
1677 ln = STR_LEN(c); /* length to match in octets/bytes */
1678
1679 /* We know that we have to match at least 'ln' bytes (which is the
1680 * same as characters, since not utf8). If we have to match 3
1681 * characters, and there are only 2 availabe, we know without
1682 * trying that it will fail; so don't start a match past the
1683 * required minimum number from the far end */
ea3daa5d 1684 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1685
02d5137b 1686 if (reginfo->intuit && e < s) {
73104a1b
KW
1687 e = s; /* Due to minlen logic of intuit() */
1688 }
fac1af77 1689
73104a1b
KW
1690 c1 = *pat_string;
1691 c2 = fold_array[c1];
1692 if (c1 == c2) { /* If char and fold are the same */
1693 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1694 }
1695 else {
1696 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1697 }
1698 break;
fac1af77 1699
73104a1b
KW
1700 do_exactf_utf8:
1701 {
1702 unsigned expansion;
1703
1704 /* If one of the operands is in utf8, we can't use the simpler folding
1705 * above, due to the fact that many different characters can have the
1706 * same fold, or portion of a fold, or different- length fold */
1707 pat_string = STRING(c);
1708 ln = STR_LEN(c); /* length to match in octets/bytes */
1709 pat_end = pat_string + ln;
984e6dd1 1710 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1711 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1712 : ln;
1713
1714 /* We have 'lnc' characters to match in the pattern, but because of
1715 * multi-character folding, each character in the target can match
1716 * up to 3 characters (Unicode guarantees it will never exceed
1717 * this) if it is utf8-encoded; and up to 2 if not (based on the
1718 * fact that the Latin 1 folds are already determined, and the
1719 * only multi-char fold in that range is the sharp-s folding to
1720 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1721 * string character. Adjust lnc accordingly, rounding up, so that
1722 * if we need to match at least 4+1/3 chars, that really is 5. */
1723 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1724 lnc = (lnc + expansion - 1) / expansion;
1725
1726 /* As in the non-UTF8 case, if we have to match 3 characters, and
1727 * only 2 are left, it's guaranteed to fail, so don't start a
1728 * match that would require us to go beyond the end of the string
1729 */
ea3daa5d 1730 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1731
02d5137b 1732 if (reginfo->intuit && e < s) {
73104a1b
KW
1733 e = s; /* Due to minlen logic of intuit() */
1734 }
0658cdde 1735
73104a1b
KW
1736 /* XXX Note that we could recalculate e to stop the loop earlier,
1737 * as the worst case expansion above will rarely be met, and as we
1738 * go along we would usually find that e moves further to the left.
1739 * This would happen only after we reached the point in the loop
1740 * where if there were no expansion we should fail. Unclear if
1741 * worth the expense */
1742
1743 while (s <= e) {
1744 char *my_strend= (char *)strend;
1745 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1746 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1747 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1748 {
1749 goto got_it;
1750 }
1751 s += (utf8_target) ? UTF8SKIP(s) : 1;
1752 }
1753 break;
1754 }
1755 case BOUNDL:
272d35c9 1756 RXp_MATCH_TAINTED_on(prog);
0eb30aeb 1757 FBC_BOUND(isWORDCHAR_LC,
f4cd282c 1758 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1759 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1760 break;
1761 case NBOUNDL:
272d35c9 1762 RXp_MATCH_TAINTED_on(prog);
0eb30aeb 1763 FBC_NBOUND(isWORDCHAR_LC,
f4cd282c 1764 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1765 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1766 break;
1767 case BOUND:
1768 FBC_BOUND(isWORDCHAR,
0eb30aeb 1769 isWORDCHAR_uni(tmp),
03940dc2 1770 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1771 break;
1772 case BOUNDA:
1773 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1774 isWORDCHAR_A(tmp),
1775 isWORDCHAR_A((U8*)s));
1776 break;
1777 case NBOUND:
1778 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1779 isWORDCHAR_uni(tmp),
03940dc2 1780 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1781 break;
1782 case NBOUNDA:
1783 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1784 isWORDCHAR_A(tmp),
1785 isWORDCHAR_A((U8*)s));
1786 break;
1787 case BOUNDU:
1788 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1789 isWORDCHAR_uni(tmp),
03940dc2 1790 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1791 break;
1792 case NBOUNDU:
1793 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1794 isWORDCHAR_uni(tmp),
03940dc2 1795 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1796 break;
73104a1b
KW
1797 case LNBREAK:
1798 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1799 is_LNBREAK_latin1_safe(s, strend)
1800 );
1801 break;
3018b823
KW
1802
1803 /* The argument to all the POSIX node types is the class number to pass to
1804 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1805
1806 case NPOSIXL:
1807 to_complement = 1;
1808 /* FALLTHROUGH */
1809
1810 case POSIXL:
272d35c9 1811 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
1812 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1813 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1814 break;
3018b823
KW
1815
1816 case NPOSIXD:
1817 to_complement = 1;
1818 /* FALLTHROUGH */
1819
1820 case POSIXD:
1821 if (utf8_target) {
1822 goto posix_utf8;
1823 }
1824 goto posixa;
1825
1826 case NPOSIXA:
1827 if (utf8_target) {
1828 /* The complement of something that matches only ASCII matches all
1829 * UTF-8 variant code points, plus everything in ASCII that isn't
1830 * in the class */
1831 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1832 || ! _generic_isCC_A(*s, FLAGS(c)));
1833 break;
1834 }
1835
1836 to_complement = 1;
1837 /* FALLTHROUGH */
1838
73104a1b 1839 case POSIXA:
3018b823 1840 posixa:
73104a1b 1841 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1842 * byte invariant character. */
1843 REXEC_FBC_CLASS_SCAN(
1844 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1845 break;
3018b823
KW
1846
1847 case NPOSIXU:
1848 to_complement = 1;
1849 /* FALLTHROUGH */
1850
1851 case POSIXU:
1852 if (! utf8_target) {
1853 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1854 FLAGS(c))));
1855 }
1856 else {
1857
1858 posix_utf8:
1859 classnum = (_char_class_number) FLAGS(c);
1860 if (classnum < _FIRST_NON_SWASH_CC) {
1861 while (s < strend) {
1862
1863 /* We avoid loading in the swash as long as possible, but
1864 * should we have to, we jump to a separate loop. This
1865 * extra 'if' statement is what keeps this code from being
1866 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1867 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1868 goto found_above_latin1;
1869 }
1870 if ((UTF8_IS_INVARIANT(*s)
1871 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1872 classnum)))
1873 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1874 && to_complement ^ cBOOL(
94bb8c36
KW
1875 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1876 *(s + 1)),
3018b823
KW
1877 classnum))))
1878 {
02d5137b 1879 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
1880 goto got_it;
1881 else {
1882 tmp = doevery;
1883 }
1884 }
1885 else {
1886 tmp = 1;
1887 }
1888 s += UTF8SKIP(s);
1889 }
1890 }
1891 else switch (classnum) { /* These classes are implemented as
1892 macros */
1893 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1894 revert the change of \v matching this */
1895 /* FALL THROUGH */
1896
1897 case _CC_ENUM_PSXSPC:
1898 REXEC_FBC_UTF8_CLASS_SCAN(
1899 to_complement ^ cBOOL(isSPACE_utf8(s)));
1900 break;
1901
1902 case _CC_ENUM_BLANK:
1903 REXEC_FBC_UTF8_CLASS_SCAN(
1904 to_complement ^ cBOOL(isBLANK_utf8(s)));
1905 break;
1906
1907 case _CC_ENUM_XDIGIT:
1908 REXEC_FBC_UTF8_CLASS_SCAN(
1909 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1910 break;
1911
1912 case _CC_ENUM_VERTSPACE:
1913 REXEC_FBC_UTF8_CLASS_SCAN(
1914 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1915 break;
1916
1917 case _CC_ENUM_CNTRL:
1918 REXEC_FBC_UTF8_CLASS_SCAN(
1919 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1920 break;
1921
1922 default:
1923 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1924 assert(0); /* NOTREACHED */
1925 }
1926 }
1927 break;
1928
1929 found_above_latin1: /* Here we have to load a swash to get the result
1930 for the current code point */
1931 if (! PL_utf8_swash_ptrs[classnum]) {
1932 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1933 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
1934 _core_swash_init("utf8",
1935 "",
1936 &PL_sv_undef, 1, 0,
1937 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
1938 }
1939
1940 /* This is a copy of the loop above for swash classes, though using the
1941 * FBC macro instead of being expanded out. Since we've loaded the
1942 * swash, we don't have to check for that each time through the loop */
1943 REXEC_FBC_UTF8_CLASS_SCAN(
1944 to_complement ^ cBOOL(_generic_utf8(
1945 classnum,
1946 s,
1947 swash_fetch(PL_utf8_swash_ptrs[classnum],
1948 (U8 *) s, TRUE))));
73104a1b
KW
1949 break;
1950
1951 case AHOCORASICKC:
1952 case AHOCORASICK:
1953 {
1954 DECL_TRIE_TYPE(c);
1955 /* what trie are we using right now */
1956 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1957 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1958 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1959
1960 const char *last_start = strend - trie->minlen;
6148ee25 1961#ifdef DEBUGGING
73104a1b 1962 const char *real_start = s;
6148ee25 1963#endif
73104a1b
KW
1964 STRLEN maxlen = trie->maxlen;
1965 SV *sv_points;
1966 U8 **points; /* map of where we were in the input string
1967 when reading a given char. For ASCII this
1968 is unnecessary overhead as the relationship
1969 is always 1:1, but for Unicode, especially
1970 case folded Unicode this is not true. */
1971 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1972 U8 *bitmap=NULL;
1973
1974
1975 GET_RE_DEBUG_FLAGS_DECL;
1976
1977 /* We can't just allocate points here. We need to wrap it in
1978 * an SV so it gets freed properly if there is a croak while
1979 * running the match */
1980 ENTER;
1981 SAVETMPS;
1982 sv_points=newSV(maxlen * sizeof(U8 *));
1983 SvCUR_set(sv_points,
1984 maxlen * sizeof(U8 *));
1985 SvPOK_on(sv_points);
1986 sv_2mortal(sv_points);
1987 points=(U8**)SvPV_nolen(sv_points );
1988 if ( trie_type != trie_utf8_fold
1989 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1990 {
1991 if (trie->bitmap)
1992 bitmap=(U8*)trie->bitmap;
1993 else
1994 bitmap=(U8*)ANYOF_BITMAP(c);
1995 }
1996 /* this is the Aho-Corasick algorithm modified a touch
1997 to include special handling for long "unknown char" sequences.
1998 The basic idea being that we use AC as long as we are dealing
1999 with a possible matching char, when we encounter an unknown char
2000 (and we have not encountered an accepting state) we scan forward
2001 until we find a legal starting char.
2002 AC matching is basically that of trie matching, except that when
2003 we encounter a failing transition, we fall back to the current
2004 states "fail state", and try the current char again, a process
2005 we repeat until we reach the root state, state 1, or a legal
2006 transition. If we fail on the root state then we can either
2007 terminate if we have reached an accepting state previously, or
2008 restart the entire process from the beginning if we have not.
2009
2010 */
2011 while (s <= last_start) {
2012 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2013 U8 *uc = (U8*)s;
2014 U16 charid = 0;
2015 U32 base = 1;
2016 U32 state = 1;
2017 UV uvc = 0;
2018 STRLEN len = 0;
2019 STRLEN foldlen = 0;
2020 U8 *uscan = (U8*)NULL;
2021 U8 *leftmost = NULL;
2022#ifdef DEBUGGING
2023 U32 accepted_word= 0;
786e8c11 2024#endif
73104a1b
KW
2025 U32 pointpos = 0;
2026
2027 while ( state && uc <= (U8*)strend ) {
2028 int failed=0;
2029 U32 word = aho->states[ state ].wordnum;
2030
2031 if( state==1 ) {
2032 if ( bitmap ) {
2033 DEBUG_TRIE_EXECUTE_r(
2034 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2035 dump_exec_pos( (char *)uc, c, strend, real_start,
2036 (char *)uc, utf8_target );
2037 PerlIO_printf( Perl_debug_log,
2038 " Scanning for legal start char...\n");
2039 }
2040 );
2041 if (utf8_target) {
2042 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2043 uc += UTF8SKIP(uc);
2044 }
2045 } else {
2046 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2047 uc++;
2048 }
786e8c11 2049 }
73104a1b 2050 s= (char *)uc;
07be1b83 2051 }
73104a1b
KW
2052 if (uc >(U8*)last_start) break;
2053 }
2054
2055 if ( word ) {
2056 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2057 if (!leftmost || lpos < leftmost) {
2058 DEBUG_r(accepted_word=word);
2059 leftmost= lpos;
7016d6eb 2060 }
73104a1b 2061 if (base==0) break;
7016d6eb 2062
73104a1b
KW
2063 }
2064 points[pointpos++ % maxlen]= uc;
2065 if (foldlen || uc < (U8*)strend) {
2066 REXEC_TRIE_READ_CHAR(trie_type, trie,
2067 widecharmap, uc,
2068 uscan, len, uvc, charid, foldlen,
2069 foldbuf, uniflags);
2070 DEBUG_TRIE_EXECUTE_r({
2071 dump_exec_pos( (char *)uc, c, strend,
2072 real_start, s, utf8_target);
2073 PerlIO_printf(Perl_debug_log,
2074 " Charid:%3u CP:%4"UVxf" ",
2075 charid, uvc);
2076 });
2077 }
2078 else {
2079 len = 0;
2080 charid = 0;
2081 }
07be1b83 2082
73104a1b
KW
2083
2084 do {
6148ee25 2085#ifdef DEBUGGING
73104a1b 2086 word = aho->states[ state ].wordnum;
6148ee25 2087#endif
73104a1b
KW
2088 base = aho->states[ state ].trans.base;
2089
2090 DEBUG_TRIE_EXECUTE_r({
2091 if (failed)
2092 dump_exec_pos( (char *)uc, c, strend, real_start,
2093 s, utf8_target );
2094 PerlIO_printf( Perl_debug_log,
2095 "%sState: %4"UVxf", word=%"UVxf,
2096 failed ? " Fail transition to " : "",
2097 (UV)state, (UV)word);
2098 });
2099 if ( base ) {
2100 U32 tmp;
2101 I32 offset;
2102 if (charid &&
2103 ( ((offset = base + charid
2104 - 1 - trie->uniquecharcount)) >= 0)
2105 && ((U32)offset < trie->lasttrans)
2106 && trie->trans[offset].check == state
2107 && (tmp=trie->trans[offset].next))
2108 {
2109 DEBUG_TRIE_EXECUTE_r(
2110 PerlIO_printf( Perl_debug_log," - legal\n"));
2111 state = tmp;
2112 break;
07be1b83
YO
2113 }
2114 else {
786e8c11 2115 DEBUG_TRIE_EXECUTE_r(
73104a1b 2116 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2117 failed = 1;
73104a1b 2118 state = aho->fail[state];
07be1b83 2119 }
07be1b83 2120 }
73104a1b
KW
2121 else {
2122 /* we must be accepting here */
2123 DEBUG_TRIE_EXECUTE_r(
2124 PerlIO_printf( Perl_debug_log," - accepting\n"));
2125 failed = 1;
2126 break;
786e8c11 2127 }
73104a1b
KW
2128 } while(state);
2129 uc += len;
2130 if (failed) {
2131 if (leftmost)
2132 break;
2133 if (!state) state = 1;
07be1b83 2134 }
73104a1b
KW
2135 }
2136 if ( aho->states[ state ].wordnum ) {
2137 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2138 if (!leftmost || lpos < leftmost) {
2139 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2140 leftmost = lpos;
07be1b83
YO
2141 }
2142 }
73104a1b
KW
2143 if (leftmost) {
2144 s = (char*)leftmost;
2145 DEBUG_TRIE_EXECUTE_r({
2146 PerlIO_printf(
2147 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2148 (UV)accepted_word, (IV)(s - real_start)
2149 );
2150 });
02d5137b 2151 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2152 FREETMPS;
2153 LEAVE;
2154 goto got_it;
2155 }
2156 s = HOPc(s,1);
2157 DEBUG_TRIE_EXECUTE_r({
2158 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2159 });
2160 } else {
2161 DEBUG_TRIE_EXECUTE_r(
2162 PerlIO_printf( Perl_debug_log,"No match.\n"));
2163 break;
2164 }
2165 }
2166 FREETMPS;
2167 LEAVE;
2168 }
2169 break;
2170 default:
2171 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2172 break;
2173 }
2174 return 0;
2175 got_it:
2176 return s;
6eb5f6b9
JH
2177}
2178
60165aa4
DM
2179/* set RX_SAVED_COPY, RX_SUBBEG etc.
2180 * flags have same meanings as with regexec_flags() */
2181
749f4950
DM
2182static void
2183S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2184 char *strbeg,
2185 char *strend,
2186 SV *sv,
2187 U32 flags,
2188 bool utf8_target)
2189{
2190 struct regexp *const prog = ReANY(rx);
2191
60165aa4
DM
2192 if (flags & REXEC_COPY_STR) {
2193#ifdef PERL_ANY_COW
2194 if (SvCANCOW(sv)) {
2195 if (DEBUG_C_TEST) {
2196 PerlIO_printf(Perl_debug_log,
2197 "Copy on write: regexp capture, type %d\n",
2198 (int) SvTYPE(sv));
2199 }
5411a0e5
DM
2200 /* Create a new COW SV to share the match string and store
2201 * in saved_copy, unless the current COW SV in saved_copy
2202 * is valid and suitable for our purpose */
2203 if (( prog->saved_copy
2204 && SvIsCOW(prog->saved_copy)
2205 && SvPOKp(prog->saved_copy)
2206 && SvIsCOW(sv)
2207 && SvPOKp(sv)
2208 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2209 {
5411a0e5
DM
2210 /* just reuse saved_copy SV */
2211 if (RXp_MATCH_COPIED(prog)) {
2212 Safefree(prog->subbeg);
2213 RXp_MATCH_COPIED_off(prog);
2214 }
2215 }
2216 else {
2217 /* create new COW SV to share string */
a76b0e90
DM
2218 RX_MATCH_COPY_FREE(rx);
2219 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2220 }
5411a0e5
DM
2221 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2222 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2223 prog->sublen = strend - strbeg;
2224 prog->suboffset = 0;
2225 prog->subcoffset = 0;
2226 } else
2227#endif
2228 {
99a90e59
FC
2229 SSize_t min = 0;
2230 SSize_t max = strend - strbeg;
ea3daa5d 2231 SSize_t sublen;
60165aa4
DM
2232
2233 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2234 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2235 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2236 ) { /* don't copy $' part of string */
2237 U32 n = 0;
2238 max = -1;
2239 /* calculate the right-most part of the string covered
2240 * by a capture. Due to look-ahead, this may be to
2241 * the right of $&, so we have to scan all captures */
2242 while (n <= prog->lastparen) {
2243 if (prog->offs[n].end > max)
2244 max = prog->offs[n].end;
2245 n++;
2246 }
2247 if (max == -1)
2248 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2249 ? prog->offs[0].start
2250 : 0;
2251 assert(max >= 0 && max <= strend - strbeg);
2252 }
2253
2254 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2255 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2256 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2257 ) { /* don't copy $` part of string */
2258 U32 n = 0;
2259 min = max;
2260 /* calculate the left-most part of the string covered
2261 * by a capture. Due to look-behind, this may be to
2262 * the left of $&, so we have to scan all captures */
2263 while (min && n <= prog->lastparen) {
2264 if ( prog->offs[n].start != -1
2265 && prog->offs[n].start < min)
2266 {
2267 min = prog->offs[n].start;
2268 }
2269 n++;
2270 }
2271 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2272 && min > prog->offs[0].end
2273 )
2274 min = prog->offs[0].end;
2275
2276 }
2277
2278 assert(min >= 0 && min <= max && min <= strend - strbeg);
2279 sublen = max - min;
2280
2281 if (RX_MATCH_COPIED(rx)) {
2282 if (sublen > prog->sublen)
2283 prog->subbeg =
2284 (char*)saferealloc(prog->subbeg, sublen+1);
2285 }
2286 else
2287 prog->subbeg = (char*)safemalloc(sublen+1);
2288 Copy(strbeg + min, prog->subbeg, sublen, char);
2289 prog->subbeg[sublen] = '\0';
2290 prog->suboffset = min;
2291 prog->sublen = sublen;
2292 RX_MATCH_COPIED_on(rx);
2293 }
2294 prog->subcoffset = prog->suboffset;
2295 if (prog->suboffset && utf8_target) {
2296 /* Convert byte offset to chars.
2297 * XXX ideally should only compute this if @-/@+
2298 * has been seen, a la PL_sawampersand ??? */
2299
2300 /* If there's a direct correspondence between the
2301 * string which we're matching and the original SV,
2302 * then we can use the utf8 len cache associated with
2303 * the SV. In particular, it means that under //g,
2304 * sv_pos_b2u() will use the previously cached
2305 * position to speed up working out the new length of
2306 * subcoffset, rather than counting from the start of
2307 * the string each time. This stops
2308 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2309 * from going quadratic */
2310 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2311 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2312 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2313 else
2314 prog->subcoffset = utf8_length((U8*)strbeg,
2315 (U8*)(strbeg+prog->suboffset));
2316 }
2317 }
2318 else {
2319 RX_MATCH_COPY_FREE(rx);
2320 prog->subbeg = strbeg;
2321 prog->suboffset = 0;
2322 prog->subcoffset = 0;
2323 prog->sublen = strend - strbeg;
2324 }
2325}
2326
2327
2328
fae667d5 2329
6eb5f6b9
JH
2330/*
2331 - regexec_flags - match a regexp against a string
2332 */
2333I32
5aaab254 2334Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2335 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2336/* stringarg: the point in the string at which to begin matching */
2337/* strend: pointer to null at end of string */
2338/* strbeg: real beginning of string */
2339/* minend: end of match must be >= minend bytes after stringarg. */
2340/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2341 * itself is accessed via the pointers above */
2342/* data: May be used for some additional optimizations.
d058ec57 2343 Currently unused. */
a340edde 2344/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2345
6eb5f6b9 2346{
97aff369 2347 dVAR;
8d919b0a 2348 struct regexp *const prog = ReANY(rx);
5aaab254 2349 char *s;
eb578fdb 2350 regnode *c;
03c83e26 2351 char *startpos;
ea3daa5d
FC
2352 SSize_t minlen; /* must match at least this many chars */
2353 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2354 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2355 I32 multiline;
f8fc2ecf 2356 RXi_GET_DECL(prog,progi);
02d5137b
DM
2357 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2358 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2359 regexp_paren_pair *swap = NULL;
006f26b2 2360 I32 oldsave;
a3621e74
YO
2361 GET_RE_DEBUG_FLAGS_DECL;
2362
7918f24d 2363 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2364 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2365
2366 /* Be paranoid... */
03c83e26 2367 if (prog == NULL || stringarg == NULL) {
6eb5f6b9
JH
2368 Perl_croak(aTHX_ "NULL regexp parameter");
2369 return 0;
2370 }
2371
6c3fea77 2372 DEBUG_EXECUTE_r(
03c83e26 2373 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2374 "Matching");
2375 );
8adc0f72 2376
b342a604
DM
2377 startpos = stringarg;
2378
58430ea8 2379 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2380 MAGIC *mg;
2381
fef7148b
DM
2382 /* set reginfo->ganch, the position where \G can match */
2383
2384 reginfo->ganch =
2385 (flags & REXEC_IGNOREPOS)
2386 ? stringarg /* use start pos rather than pos() */
2387 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2388 /* Defined pos(): */
2389 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2390 : strbeg; /* pos() not defined; use start of string */
2391
2392 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2393 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2394
03c83e26
DM
2395 /* in the presence of \G, we may need to start looking earlier in
2396 * the string than the suggested start point of stringarg:
0b2c2a84 2397 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2398 * offset, such as
2399 * /..\G/: gofs = 2
2400 * /ab|c\G/: gofs = 1
2401 * or if the minimum offset isn't known, then we have to go back
2402 * to the start of the string, e.g. /w+\G/
2403 */
2bfbe302 2404
8e1490ee 2405 if (prog->intflags & PREGf_ANCH_GPOS) {
2bfbe302
DM
2406 startpos = reginfo->ganch - prog->gofs;
2407 if (startpos <
2408 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2409 {
2410 DEBUG_r(PerlIO_printf(Perl_debug_log,
2411 "fail: ganch-gofs before earliest possible start\n"));
2412 return 0;
2413 }
2414 }
2415 else if (prog->gofs) {
b342a604
DM
2416 if (startpos - prog->gofs < strbeg)
2417 startpos = strbeg;
2418 else
2419 startpos -= prog->gofs;
03c83e26 2420 }
58430ea8 2421 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2422 startpos = strbeg;
03c83e26
DM
2423 }
2424
2425 minlen = prog->minlen;
b342a604 2426 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2427 DEBUG_r(PerlIO_printf(Perl_debug_log,
2428 "Regex match can't succeed, so not even tried\n"));
2429 return 0;
2430 }
2431
63a3746a
DM
2432 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2433 * which will call destuctors to reset PL_regmatch_state, free higher
2434 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2435 * regmatch_info_aux_eval */
2436
2437 oldsave = PL_savestack_ix;
2438
dfa77d06
DM
2439 s = startpos;
2440
e322109a 2441 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2442 && !(flags & REXEC_CHECKED))
2443 {
dfa77d06 2444 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2445 flags, NULL);
dfa77d06 2446 if (!s)
7fadf4a7
DM
2447 return 0;
2448
e322109a 2449 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2450 /* we can match based purely on the result of INTUIT.
2451 * Set up captures etc just for $& and $-[0]
2452 * (an intuit-only match wont have $1,$2,..) */
2453 assert(!prog->nparens);
d5e7783a
DM
2454
2455 /* s/// doesn't like it if $& is earlier than where we asked it to
2456 * start searching (which can happen on something like /.\G/) */
2457 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2458 && (s < stringarg))
2459 {
2460 /* this should only be possible under \G */
58430ea8 2461 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2462 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2463 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2464 goto phooey;
2465 }
2466
7fadf4a7
DM
2467 /* match via INTUIT shouldn't have any captures.
2468 * Let @-, @+, $^N know */
2469 prog->lastparen = prog->lastcloseparen = 0;
2470 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2471 prog->offs[0].start = s - strbeg;
2472 prog->offs[0].end = utf8_target
2473 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2474 : s - strbeg + prog->minlenret;
7fadf4a7 2475 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2476 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2477 strbeg, strend,
2478 sv, flags, utf8_target);
2479
7fadf4a7
DM
2480 return 1;
2481 }
2482 }
2483
6c3fea77 2484 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2485
dfa77d06 2486 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2487 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2488 "String too short [regexec_flags]...\n"));
2489 goto phooey;
1aa99e6b 2490 }
1de06328 2491
6eb5f6b9 2492 /* Check validity of program. */
f8fc2ecf 2493 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2494 Perl_croak(aTHX_ "corrupted regexp program");
2495 }
2496
272d35c9 2497 RX_MATCH_TAINTED_off(rx);
6eb5f6b9 2498
6c3fea77
DM
2499 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2500 reginfo->intuit = 0;
2501 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2502 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2503 reginfo->warned = FALSE;
9d9163fb 2504 reginfo->strbeg = strbeg;
02d5137b 2505 reginfo->sv = sv;
1cb48e53 2506 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2507 reginfo->strend = strend;
6eb5f6b9 2508 /* see how far we have to get to not match where we matched before */
fe3974be 2509 reginfo->till = stringarg + minend;
6eb5f6b9 2510
82c23608
FC
2511 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2512 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2513 S_cleanup_regmatch_info_aux has executed (registered by
2514 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2515 magic belonging to this SV.
2516 Not newSVsv, either, as it does not COW.
2517 */
2518 reginfo->sv = newSV(0);
4cba5ac0 2519 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2520 SAVEFREESV(reginfo->sv);
2521 }
2522
331b2dcc
DM
2523 /* reserve next 2 or 3 slots in PL_regmatch_state:
2524 * slot N+0: may currently be in use: skip it
2525 * slot N+1: use for regmatch_info_aux struct
2526 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2527 * slot N+3: ready for use by regmatch()
2528 */
bf2039a9 2529
331b2dcc
DM
2530 {
2531 regmatch_state *old_regmatch_state;
2532 regmatch_slab *old_regmatch_slab;
2533 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2534
2535 /* on first ever match, allocate first slab */
2536 if (!PL_regmatch_slab) {
2537 Newx(PL_regmatch_slab, 1, regmatch_slab);
2538 PL_regmatch_slab->prev = NULL;
2539 PL_regmatch_slab->next = NULL;
2540 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2541 }
bf2039a9 2542
331b2dcc
DM
2543 old_regmatch_state = PL_regmatch_state;
2544 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2545
331b2dcc
DM
2546 for (i=0; i <= max; i++) {
2547 if (i == 1)
2548 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2549 else if (i ==2)
2550 reginfo->info_aux_eval =
2551 reginfo->info_aux->info_aux_eval =
2552 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 2553
331b2dcc
DM
2554 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2555 PL_regmatch_state = S_push_slab(aTHX);
2556 }
bf2039a9 2557
331b2dcc
DM
2558 /* note initial PL_regmatch_state position; at end of match we'll
2559 * pop back to there and free any higher slabs */
bf2039a9 2560
331b2dcc
DM
2561 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2562 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 2563 reginfo->info_aux->poscache = NULL;
bf2039a9 2564
331b2dcc 2565 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 2566
331b2dcc
DM
2567 if ((prog->extflags & RXf_EVAL_SEEN))
2568 S_setup_eval_state(aTHX_ reginfo);
2569 else
2570 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 2571 }
d3aa529c 2572
6eb5f6b9 2573 /* If there is a "must appear" string, look for it. */
6eb5f6b9 2574
288b8c02 2575 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2576 /* We have to be careful. If the previous successful match
2577 was from this regex we don't want a subsequent partially
2578 successful match to clobber the old results.
2579 So when we detect this possibility we add a swap buffer
d8da0584
KW
2580 to the re, and switch the buffer each match. If we fail,
2581 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2582 */
2583 swap = prog->offs;
2584 /* do we need a save destructor here for eval dies? */
2585 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2586 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2587 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2588 PTR2UV(prog),
2589 PTR2UV(swap),
2590 PTR2UV(prog->offs)
2591 ));
c74340f9 2592 }
6eb5f6b9
JH
2593
2594 /* Simplest case: anchored match need be tried only once. */
2595 /* [unless only anchor is BOL and multiline is set] */
8e1490ee 2596 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3542935d 2597 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9 2598 goto got_it;
8e1490ee 2599 else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
6eb5f6b9
JH
2600 {
2601 char *end;
2602
2603 if (minlen)
2604 dontbother = minlen - 1;
1aa99e6b 2605 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2606 /* for multiline we only have to try after newlines */
33b8afdf 2607 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2608 /* because of the goto we can not easily reuse the macros for bifurcating the
2609 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2610 if (utf8_target) {
2611 if (s == startpos)
2612 goto after_try_utf8;
2613 while (1) {
02d5137b 2614 if (regtry(reginfo, &s)) {
92f3d482
YO
2615 goto got_it;
2616 }
2617 after_try_utf8:
2618 if (s > end) {
2619 goto phooey;
2620 }
2621 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2622 s = re_intuit_start(rx, sv, strbeg,
2623 s + UTF8SKIP(s), strend, flags, NULL);
92f3d482
YO
2624 if (!s) {
2625 goto phooey;
2626 }
2627 }
2628 else {
2629 s += UTF8SKIP(s);
2630 }
2631 }
2632 } /* end search for check string in unicode */
2633 else {
2634 if (s == startpos) {
2635 goto after_try_latin;
2636 }
2637 while (1) {
02d5137b 2638 if (regtry(reginfo, &s)) {
92f3d482
YO
2639 goto got_it;
2640 }
2641 after_try_latin:
2642 if (s > end) {
2643 goto phooey;
2644 }
2645 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2646 s = re_intuit_start(rx, sv, strbeg,
2647 s + 1, strend, flags, NULL);
92f3d482
YO
2648 if (!s) {
2649 goto phooey;
2650 }
2651 }
2652 else {
2653 s++;
2654 }
2655 }
2656 } /* end search for check string in latin*/
2657 } /* end search for check string */
2658 else { /* search for newline */
2659 if (s > startpos) {
2660 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2661 s--;
92f3d482 2662 }
21eede78
YO
2663 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2664 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2665 if (*s++ == '\n') { /* don't need PL_utf8skip here */
02d5137b 2666 if (regtry(reginfo, &s))
6eb5f6b9
JH
2667 goto got_it;
2668 }
92f3d482
YO
2669 }
2670 } /* end search for newline */
2671 } /* end anchored/multiline check string search */
6eb5f6b9 2672 goto phooey;
a8430a8b 2673 } else if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 2674 {
a8430a8b
YO
2675 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2676 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
2677 /* For anchored \G, the only position it can match from is
2678 * (ganch-gofs); we already set startpos to this above; if intuit
2679 * moved us on from there, we can't possibly succeed */
2680 assert(startpos == reginfo->ganch - prog->gofs);
2681 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
2682 goto got_it;
2683 goto phooey;
2684 }
2685
2686 /* Messy cases: unanchored match. */
bbe252da 2687 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2688 /* we have /x+whatever/ */
984e6dd1 2689 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2690 char ch;
bf93d4cc
GS
2691#ifdef DEBUGGING
2692 int did_match = 0;
2693#endif
f2ed9b32 2694 if (utf8_target) {
7e0d5ad7
KW
2695 if (! prog->anchored_utf8) {
2696 to_utf8_substr(prog);
2697 }
2698 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2699 REXEC_FBC_SCAN(
6eb5f6b9 2700 if (*s == ch) {
a3621e74 2701 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2702 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2703 s += UTF8SKIP(s);
2704 while (s < strend && *s == ch)
2705 s += UTF8SKIP(s);
2706 }
4cadc6a9 2707 );
7e0d5ad7 2708
6eb5f6b9
JH
2709 }
2710 else {
7e0d5ad7
KW
2711 if (! prog->anchored_substr) {
2712 if (! to_byte_substr(prog)) {
6b54ddc5 2713 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2714 }
2715 }
2716 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2717 REXEC_FBC_SCAN(
6eb5f6b9 2718 if (*s == ch) {
a3621e74 2719 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2720 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2721 s++;
2722 while (s < strend && *s == ch)
2723 s++;
2724 }
4cadc6a9 2725 );
6eb5f6b9 2726 }
a3621e74 2727 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2728 PerlIO_printf(Perl_debug_log,
b7953727
JH
2729 "Did not find anchored character...\n")
2730 );
6eb5f6b9 2731 }
a0714e2c
SS
2732 else if (prog->anchored_substr != NULL
2733 || prog->anchored_utf8 != NULL
2734 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2735 && prog->float_max_offset < strend - s)) {
2736 SV *must;
ea3daa5d
FC
2737 SSize_t back_max;
2738 SSize_t back_min;
33b8afdf 2739 char *last;
6eb5f6b9 2740 char *last1; /* Last position checked before */
bf93d4cc
GS
2741#ifdef DEBUGGING
2742 int did_match = 0;
2743#endif
33b8afdf 2744 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2745 if (utf8_target) {
2746 if (! prog->anchored_utf8) {
2747 to_utf8_substr(prog);
2748 }
2749 must = prog->anchored_utf8;
2750 }
2751 else {
2752 if (! prog->anchored_substr) {
2753 if (! to_byte_substr(prog)) {
6b54ddc5 2754 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2755 }
2756 }
2757 must = prog->anchored_substr;
2758 }
33b8afdf
JH
2759 back_max = back_min = prog->anchored_offset;
2760 } else {
7e0d5ad7
KW
2761 if (utf8_target) {
2762 if (! prog->float_utf8) {
2763 to_utf8_substr(prog);
2764 }
2765 must = prog->float_utf8;
2766 }
2767 else {
2768 if (! prog->float_substr) {
2769 if (! to_byte_substr(prog)) {
6b54ddc5 2770 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2771 }
2772 }
2773 must = prog->float_substr;
2774 }
33b8afdf
JH
2775 back_max = prog->float_max_offset;
2776 back_min = prog->float_min_offset;
2777 }
1de06328 2778
1de06328
YO
2779 if (back_min<0) {
2780 last = strend;
2781 } else {
2782 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 2783 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
2784 - (SvTAIL(must) != 0) + back_min), strbeg);
2785 }
9d9163fb 2786 if (s > reginfo->strbeg)
6eb5f6b9
JH
2787 last1 = HOPc(s, -1);
2788 else
2789 last1 = s - 1; /* bogus */
2790
a0288114 2791 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 2792 check_substr==must. */
bf05793b 2793 dontbother = 0;
6eb5f6b9
JH
2794 strend = HOPc(strend, -dontbother);
2795 while ( (s <= last) &&
e50d57d4 2796 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 2797 (unsigned char*)strend, must,
c33e64f0 2798 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2799 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2800 if (HOPc(s, -back_max) > last1) {
2801 last1 = HOPc(s, -back_min);
2802 s = HOPc(s, -back_max);
2803 }
2804 else {
9d9163fb
DM
2805 char * const t = (last1 >= reginfo->strbeg)
2806 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2807
2808 last1 = HOPc(s, -back_min);
52657f30 2809 s = t;
6eb5f6b9 2810 }
f2ed9b32 2811 if (utf8_target) {
6eb5f6b9 2812 while (s <= last1) {
02d5137b 2813 if (regtry(reginfo, &s))
6eb5f6b9 2814 goto got_it;
7016d6eb
DM
2815 if (s >= last1) {
2816 s++; /* to break out of outer loop */
2817 break;
2818 }
2819 s += UTF8SKIP(s);
6eb5f6b9
JH
2820 }
2821 }
2822 else {
2823 while (s <= last1) {
02d5137b 2824 if (regtry(reginfo, &s))
6eb5f6b9
JH
2825 goto got_it;
2826 s++;
2827 }
2828 }
2829 }
ab3bbdeb 2830 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2831 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2832 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2833 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2834 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2835 ? "anchored" : "floating"),
ab3bbdeb
YO
2836 quoted, RE_SV_TAIL(must));
2837 });
6eb5f6b9
JH
2838 goto phooey;
2839 }
f8fc2ecf 2840 else if ( (c = progi->regstclass) ) {
f14c76ed 2841 if (minlen) {
f8fc2ecf 2842 const OPCODE op = OP(progi->regstclass);
66e933ab 2843 /* don't bother with what can't match */
786e8c11 2844 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2845 strend = HOPc(strend, -(minlen - 1));
2846 }
a3621e74 2847 DEBUG_EXECUTE_r({
be8e71aa 2848 SV * const prop = sv_newmortal();
32fc9b6a 2849 regprop(prog, prop, c);
0df25f3d 2850 {
f2ed9b32 2851 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2852 s,strend-s,60);
0df25f3d 2853 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2854 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2855 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2856 quoted, (int)(strend - s));
0df25f3d 2857 }
ffc61ed2 2858 });
f9176b44 2859 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 2860 goto got_it;
07be1b83 2861 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2862 }
2863 else {
2864 dontbother = 0;
a0714e2c 2865 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2866 /* Trim the end. */
6af40bd7 2867 char *last= NULL;
33b8afdf 2868 SV* float_real;
c33e64f0
FC
2869 STRLEN len;
2870 const char *little;
33b8afdf 2871
7e0d5ad7
KW
2872 if (utf8_target) {
2873 if (! prog->float_utf8) {
2874 to_utf8_substr(prog);
2875 }
2876 float_real = prog->float_utf8;
2877 }
2878 else {
2879 if (! prog->float_substr) {
2880 if (! to_byte_substr(prog)) {
6b54ddc5 2881 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2882 }
2883 }
2884 float_real = prog->float_substr;
2885 }
d6a28714 2886
c33e64f0
FC
2887 little = SvPV_const(float_real, len);
2888 if (SvTAIL(float_real)) {
7f18ad16
KW
2889 /* This means that float_real contains an artificial \n on
2890 * the end due to the presence of something like this:
2891 * /foo$/ where we can match both "foo" and "foo\n" at the
2892 * end of the string. So we have to compare the end of the
2893 * string first against the float_real without the \n and
2894 * then against the full float_real with the string. We
2895 * have to watch out for cases where the string might be
2896 * smaller than the float_real or the float_real without
2897 * the \n. */
1a13b075
YO
2898 char *checkpos= strend - len;
2899 DEBUG_OPTIMISE_r(
2900 PerlIO_printf(Perl_debug_log,
2901 "%sChecking for float_real.%s\n",
2902 PL_colors[4], PL_colors[5]));
2903 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2904 /* can't match, even if we remove the trailing \n
2905 * string is too short to match */
1a13b075
YO
2906 DEBUG_EXECUTE_r(
2907 PerlIO_printf(Perl_debug_log,
2908 "%sString shorter than required trailing substring, cannot match.%s\n",
2909 PL_colors[4], PL_colors[5]));
2910 goto phooey;
2911 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2912 /* can match, the end of the string matches without the
2913 * "\n" */
1a13b075
YO
2914 last = checkpos + 1;
2915 } else if (checkpos < strbeg) {
7f18ad16
KW
2916 /* cant match, string is too short when the "\n" is
2917 * included */
1a13b075
YO
2918 DEBUG_EXECUTE_r(
2919 PerlIO_printf(Perl_debug_log,
2920 "%sString does not contain required trailing substring, cannot match.%s\n",
2921 PL_colors[4], PL_colors[5]));
2922 goto phooey;
2923 } else if (!multiline) {
7f18ad16
KW
2924 /* non multiline match, so compare with the "\n" at the
2925 * end of the string */
1a13b075
YO
2926 if (memEQ(checkpos, little, len)) {
2927 last= checkpos;
2928 } else {
2929 DEBUG_EXECUTE_r(
2930 PerlIO_printf(Perl_debug_log,
2931 "%sString does not contain required trailing substring, cannot match.%s\n",
2932 PL_colors[4], PL_colors[5]));
2933 goto phooey;
2934 }
2935 } else {
7f18ad16
KW
2936 /* multiline match, so we have to search for a place
2937 * where the full string is located */
d6a28714 2938 goto find_last;
1a13b075 2939 }
c33e64f0 2940 } else {
d6a28714 2941 find_last:
9041c2e3 2942 if (len)
d6a28714 2943 last = rninstr(s, strend, little, little + len);
b8c5462f 2944 else
a0288114 2945 last = strend; /* matching "$" */
b8c5462f 2946 }
6af40bd7 2947 if (!last) {
7f18ad16
KW
2948 /* at one point this block contained a comment which was
2949 * probably incorrect, which said that this was a "should not
2950 * happen" case. Even if it was true when it was written I am
2951 * pretty sure it is not anymore, so I have removed the comment
2952 * and replaced it with this one. Yves */
6bda09f9
YO
2953 DEBUG_EXECUTE_r(
2954 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2955 "String does not contain required substring, cannot match.\n"
2956 ));
2957 goto phooey;
bf93d4cc 2958 }
d6a28714
JH
2959 dontbother = strend - last + prog->float_min_offset;
2960 }
2961 if (minlen && (dontbother < minlen))
2962 dontbother = minlen - 1;
2963 strend -= dontbother; /* this one's always in bytes! */
2964 /* We don't know much -- general case. */
f2ed9b32 2965 if (utf8_target) {
d6a28714 2966 for (;;) {
02d5137b 2967 if (regtry(reginfo, &s))
d6a28714
JH
2968 goto got_it;
2969 if (s >= strend)
2970 break;
b8c5462f 2971 s += UTF8SKIP(s);
d6a28714
JH
2972 };
2973 }
2974 else {
2975 do {
02d5137b 2976 if (regtry(reginfo, &s))
d6a28714
JH
2977 goto got_it;
2978 } while (s++ < strend);
2979 }
2980 }
2981
2982 /* Failure. */
2983 goto phooey;
2984
2985got_it:
d5e7783a
DM
2986 /* s/// doesn't like it if $& is earlier than where we asked it to
2987 * start searching (which can happen on something like /.\G/) */
2988 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2989 && (prog->offs[0].start < stringarg - strbeg))
2990 {
2991 /* this should only be possible under \G */
58430ea8 2992 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2993 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2994 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2995 goto phooey;
2996 }
2997
495f47a5
DM
2998 DEBUG_BUFFERS_r(
2999 if (swap)
3000 PerlIO_printf(Perl_debug_log,
3001 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3002 PTR2UV(prog),
3003 PTR2UV(swap)
3004 );
3005 );
e9105d30 3006 Safefree(swap);
d6a28714 3007
bf2039a9
DM
3008 /* clean up; this will trigger destructors that will free all slabs
3009 * above the current one, and cleanup the regmatch_info_aux
3010 * and regmatch_info_aux_eval sructs */
8adc0f72 3011
006f26b2
DM
3012 LEAVE_SCOPE(oldsave);
3013
5daac39c
NC
3014 if (RXp_PAREN_NAMES(prog))
3015 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714 3016
0254aed9
DM
3017 RX_MATCH_UTF8_set(rx, utf8_target);
3018
d6a28714 3019 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3020 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3021 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3022 strbeg, reginfo->strend,
3023 sv, flags, utf8_target);
9041c2e3 3024
d6a28714
JH
3025 return 1;
3026
3027phooey:
a3621e74 3028 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 3029 PL_colors[4], PL_colors[5]));
8adc0f72 3030
bf2039a9
DM
3031 /* clean up; this will trigger destructors that will free all slabs
3032 * above the current one, and cleanup the regmatch_info_aux
3033 * and regmatch_info_aux_eval sructs */
8adc0f72 3034
006f26b2
DM
3035 LEAVE_SCOPE(oldsave);
3036
e9105d30 3037 if (swap) {
c74340f9 3038 /* we failed :-( roll it back */
495f47a5
DM
3039 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3040 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3041 PTR2UV(prog),
3042 PTR2UV(prog->offs),
3043 PTR2UV(swap)
3044 ));
e9105d30
GG
3045 Safefree(prog->offs);
3046 prog->offs = swap;
3047 }
d6a28714
JH
3048 return 0;
3049}
3050
6bda09f9 3051
b3d298be 3052/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3053 * Do inc before dec, in case old and new rex are the same */
baa60164 3054#define SET_reg_curpm(Re2) \
bf2039a9 3055 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3056 (void)ReREFCNT_inc(Re2); \
3057 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3058 PM_SETRE((PL_reg_curpm), (Re2)); \
3059 }
3060
3061
d6a28714
JH
3062/*
3063 - regtry - try match at specific point
3064 */
3065STATIC I32 /* 0 failure, 1 success */
f73aaa43 3066S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3067{
97aff369 3068 dVAR;
d6a28714 3069 CHECKPOINT lastcp;
288b8c02 3070 REGEXP *const rx = reginfo->prog;
8d919b0a 3071 regexp *const prog = ReANY(rx);
99a90e59 3072 SSize_t result;
f8fc2ecf 3073 RXi_GET_DECL(prog,progi);
a3621e74 3074 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3075
3076 PERL_ARGS_ASSERT_REGTRY;
3077
24b23f37 3078 reginfo->cutpoint=NULL;
d6a28714 3079
9d9163fb 3080 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3081 prog->lastparen = 0;
03994de8 3082 prog->lastcloseparen = 0;
d6a28714
JH
3083
3084 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3085 to do this again and again, prog->lastparen should take care of
3dd2943c 3086 this! --ilya*/
dafc8851
JH
3087
3088 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3089 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3090 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3091 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3092 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3093 * Meanwhile, this code *is* needed for the
daf18116
JH
3094 * above-mentioned test suite tests to succeed. The common theme
3095 * on those tests seems to be returning null fields from matches.
225593e1 3096 * --jhi updated by dapm */
dafc8851 3097#if 1
d6a28714 3098 if (prog->nparens) {
b93070ed 3099 regexp_paren_pair *pp = prog->offs;
eb578fdb 3100 I32 i;
b93070ed 3101 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3102 ++pp;
3103 pp->start = -1;
3104 pp->end = -1;
d6a28714
JH
3105 }
3106 }
dafc8851 3107#endif
02db2b7b 3108 REGCP_SET(lastcp);
f73aaa43
DM
3109 result = regmatch(reginfo, *startposp, progi->program + 1);
3110 if (result != -1) {
3111 prog->offs[0].end = result;
d6a28714
JH
3112 return 1;
3113 }
24b23f37 3114 if (reginfo->cutpoint)
f73aaa43 3115 *startposp= reginfo->cutpoint;
02db2b7b 3116 REGCP_UNWIND(lastcp);
d6a28714
JH
3117 return 0;
3118}
3119
02db2b7b 3120
8ba1375e
MJD
3121#define sayYES goto yes
3122#define sayNO goto no
262b90c4 3123#define sayNO_SILENT goto no_silent
8ba1375e 3124
f9f4320a
YO
3125/* we dont use STMT_START/END here because it leads to
3126 "unreachable code" warnings, which are bogus, but distracting. */
3127#define CACHEsayNO \
c476f425 3128 if (ST.cache_mask) \
2ac8ff4b 3129 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3130 sayNO
3298f257 3131
a3621e74 3132/* this is used to determine how far from the left messages like
265c4333
YO
3133 'failed...' are printed. It should be set such that messages
3134 are inline with the regop output that created them.
a3621e74 3135*/
265c4333 3136#define REPORT_CODE_OFF 32
a3621e74
YO
3137
3138
40a82448
DM
3139#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3140#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3141#define CHRTEST_NOT_A_CP_1 -999
3142#define CHRTEST_NOT_A_CP_2 -998
9e137952 3143
5d9a96ca
DM
3144/* grab a new slab and return the first slot in it */
3145
3146STATIC regmatch_state *
3147S_push_slab(pTHX)
3148{
a35a87e7 3149#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3150 dMY_CXT;
3151#endif
5d9a96ca
DM
3152 regmatch_slab *s = PL_regmatch_slab->next;
3153 if (!s) {
3154 Newx(s, 1, regmatch_slab);
3155 s->prev = PL_regmatch_slab;
3156 s->next = NULL;
3157 PL_regmatch_slab->next = s;
3158 }
3159 PL_regmatch_slab = s;
86545054 3160 return SLAB_FIRST(s);
5d9a96ca 3161}
5b47454d 3162
95b24440 3163
40a82448
DM
3164/* push a new state then goto it */
3165
4d5016e5
DM
3166#define PUSH_STATE_GOTO(state, node, input) \
3167 pushinput = input; \
40a82448
DM
3168 scan = node; \
3169 st->resume_state = state; \
3170 goto push_state;
3171
3172/* push a new state with success backtracking, then goto it */
3173
4d5016e5
DM
3174#define PUSH_YES_STATE_GOTO(state, node, input) \
3175 pushinput = input; \
40a82448
DM
3176 scan = node; \
3177 st->resume_state = state; \
3178 goto push_yes_state;
3179
aa283a38 3180
aa283a38 3181
4d5016e5 3182
d6a28714 3183/*
95b24440 3184
bf1f174e
DM
3185regmatch() - main matching routine
3186
3187This is basically one big switch statement in a loop. We execute an op,
3188set 'next' to point the next op, and continue. If we come to a point which
3189we may need to backtrack to on failure such as (A|B|C), we push a
3190backtrack state onto the backtrack stack. On failure, we pop the top
3191state, and re-enter the loop at the state indicated. If there are no more
3192states to pop, we return failure.
3193
3194Sometimes we also need to backtrack on success; for example /A+/, where
3195after successfully matching one A, we need to go back and try to
3196match another one; similarly for lookahead assertions: if the assertion
3197completes successfully, we backtrack to the state just before the assertion
3198and then carry on. In these cases, the pushed state is marked as
3199'backtrack on success too'. This marking is in fact done by a chain of
3200pointers, each pointing to the previous 'yes' state. On success, we pop to
3201the nearest yes state, discarding any intermediate failure-only states.
3202Sometimes a yes state is pushed just to force some cleanup code to be
3203called at the end of a successful match or submatch; e.g. (??{$re}) uses
3204it to free the inner regex.
3205
3206Note that failure backtracking rewinds the cursor position, while
3207success backtracking leaves it alone.
3208
3209A pattern is complete when the END op is executed, while a subpattern
3210such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3211ops trigger the "pop to last yes state if any, otherwise return true"
3212behaviour.
3213
3214A common convention in this function is to use A and B to refer to the two
3215subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3216the subpattern to be matched possibly multiple times, while B is the entire
3217rest of the pattern. Variable and state names reflect this convention.
3218
3219The states in the main switch are the union of ops and failure/success of
3220substates associated with with that op. For example, IFMATCH is the op
3221that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3222'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3223successfully matched A and IFMATCH_A_fail is a state saying that we have
3224just failed to match A. Resume states always come in pairs. The backtrack
3225state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3226at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3227on success or failure.
3228
3229The struct that holds a backtracking state is actually a big union, with
3230one variant for each major type of op. The variable st points to the
3231top-most backtrack struct. To make the code clearer, within each
3232block of code we #define ST to alias the relevant union.
3233
3234Here's a concrete example of a (vastly oversimplified) IFMATCH
3235implementation:
3236
3237 switch (state) {
3238 ....
3239
3240#define ST st->u.ifmatch
3241
3242 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3243 ST.foo = ...; // some state we wish to save
95b24440 3244 ...
bf1f174e
DM
3245 // push a yes backtrack state with a resume value of
3246 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3247 // first node of A:
4d5016e5 3248 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3249 // NOTREACHED
3250
3251 case IFMATCH_A: // we have successfully executed A; now continue with B
3252 next = B;
3253 bar = ST.foo; // do something with the preserved value
3254 break;
3255
3256 case IFMATCH_A_fail: // A failed, so the assertion failed
3257 ...; // do some housekeeping, then ...
3258 sayNO; // propagate the failure
3259
3260#undef ST
95b24440 3261
bf1f174e
DM
3262 ...
3263 }
95b24440 3264
bf1f174e
DM
3265For any old-timers reading this who are familiar with the old recursive
3266approach, the code above is equivalent to:
95b24440 3267
bf1f174e
DM
3268 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3269 {
3270 int foo = ...
95b24440 3271 ...
bf1f174e
DM
3272 if (regmatch(A)) {
3273 next = B;
3274 bar = foo;
3275 break;
95b24440 3276 }
bf1f174e
DM
3277 ...; // do some housekeeping, then ...
3278 sayNO; // propagate the failure
95b24440 3279 }
bf1f174e
DM
3280
3281The topmost backtrack state, pointed to by st, is usually free. If you
3282want to claim it, populate any ST.foo fields in it with values you wish to
3283save, then do one of
3284
4d5016e5
DM
3285 PUSH_STATE_GOTO(resume_state, node, newinput);
3286 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3287
3288which sets that backtrack state's resume value to 'resume_state', pushes a
3289new free entry to the top of the backtrack stack, then goes to 'node'.
3290On backtracking, the free slot is popped, and the saved state becomes the
3291new free state. An ST.foo field in this new top state can be temporarily
3292accessed to retrieve values, but once the main loop is re-entered, it
3293becomes available for reuse.
3294
3295Note that the depth of the backtrack stack constantly increases during the
3296left-to-right execution of the pattern, rather than going up and down with
3297the pattern nesting. For example the stack is at its maximum at Z at the
3298end of the pattern, rather than at X in the following:
3299
3300 /(((X)+)+)+....(Y)+....Z/
3301
3302The only exceptions to this are lookahead/behind assertions and the cut,
3303(?>A), which pop all the backtrack states associated with A before
3304continuing.
3305
486ec47a 3306Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3307PL_regmatch_state and st always point to the currently active state,
3308and PL_regmatch_slab points to the slab currently containing
3309PL_regmatch_state. The first time regmatch() is called, the first slab is
3310allocated, and is never freed until interpreter destruction. When the slab
3311is full, a new one is allocated and chained to the end. At exit from
3312regmatch(), slabs allocated since entry are freed.
3313
3314*/
95b24440 3315
40a82448 3316
5bc10b2c 3317#define DEBUG_STATE_pp(pp) \
265c4333 3318 DEBUG_STATE_r({ \
baa60164 3319 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3320 PerlIO_printf(Perl_debug_log, \
5d458dd8 3321 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3322 depth*2, "", \
baa60164 3323 PL_reg_name[st->resume_state], \
5d458dd8
YO
3324 ((st==yes_state||st==mark_state) ? "[" : ""), \
3325 ((st==yes_state) ? "Y" : ""), \
3326 ((st==mark_state) ? "M" : ""), \
3327 ((st==yes_state||st==mark_state) ? "]" : "") \
3328 ); \
265c4333 3329 });
5bc10b2c 3330
40a82448 3331
3dab1dad 3332#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3333
3df15adc 3334#ifdef DEBUGGING
5bc10b2c 3335
ab3bbdeb 3336STATIC void
f2ed9b32 3337S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3338 const char *start, const char *end, const char *blurb)
3339{
efd26800 3340 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3341
3342 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3343
ab3bbdeb
YO
3344 if (!PL_colorset)
3345 reginitcolors();
3346 {
3347 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3348 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3349
f2ed9b32 3350 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3351 start, end - start, 60);
3352
3353 PerlIO_printf(Perl_debug_log,
3354 "%s%s REx%s %s against %s\n",
3355 PL_colors[4], blurb, PL_colors[5], s0, s1);
3356
f2ed9b32 3357 if (utf8_target||utf8_pat)
1de06328
YO
3358 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3359 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3360 utf8_pat && utf8_target ? " and " : "",
3361 utf8_target ? "string" : ""
ab3bbdeb
YO
3362 );
3363 }
3364}
3df15adc
YO
3365
3366STATIC void
786e8c11
YO
3367S_dump_exec_pos(pTHX_ const char *locinput,
3368 const regnode *scan,
3369 const char *loc_regeol,
3370 const char *loc_bostr,
3371 const char *loc_reg_starttry,
f2ed9b32 3372 const bool utf8_target)
07be1b83 3373{
786e8c11 3374 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3375 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3376 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3377 /* The part of the string before starttry has one color
3378 (pref0_len chars), between starttry and current
3379 position another one (pref_len - pref0_len chars),
3380 after the current position the third one.
3381 We assume that pref0_len <= pref_len, otherwise we
3382 decrease pref0_len. */
786e8c11
YO
3383 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3384 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3385 int pref0_len;
3386
7918f24d
NC
3387 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3388
f2ed9b32 3389 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3390 pref_len++;
786e8c11
YO
3391 pref0_len = pref_len - (locinput - loc_reg_starttry);
3392 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3393 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3394 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3395 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3396 l--;
3397 if (pref0_len < 0)
3398 pref0_len = 0;
3399 if (pref0_len > pref_len)
3400 pref0_len = pref_len;
3401 {
f2ed9b32 3402 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3403
ab3bbdeb 3404 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3405 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3406
ab3bbdeb 3407 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3408 (locinput - pref_len + pref0_len),
1de06328 3409 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3410
ab3bbdeb 3411 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3412 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3413
1de06328 3414 const STRLEN tlen=len0+len1+len2;
3df15adc 3415 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3416 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3417 (IV)(locinput - loc_bostr),
07be1b83 3418 len0, s0,
07be1b83 3419 len1, s1,
07be1b83 3420 (docolor ? "" : "> <"),
07be1b83 3421 len2, s2,
f9f4320a 3422 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3423 "");
3424 }
3425}
3df15adc 3426
07be1b83
YO
3427#endif
3428
0a4db386
YO
3429/* reg_check_named_buff_matched()
3430 * Checks to see if a named buffer has matched. The data array of
3431 * buffer numbers corresponding to the buffer is expected to reside
3432 * in the regexp->data->data array in the slot stored in the ARG() of
3433 * node involved. Note that this routine doesn't actually care about the
3434 * name, that information is not preserved from compilation to execution.
3435 * Returns the index of the leftmost defined buffer with the given name
3436 * or 0 if non of the buffers matched.
3437 */
3438STATIC I32
7918f24d
NC
3439S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3440{
0a4db386 3441 I32 n;
f8fc2ecf 3442 RXi_GET_DECL(rex,rexi);
ad64d0ec 3443 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3444 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3445
3446 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3447
0a4db386 3448 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3449 if ((I32)rex->lastparen >= nums[n] &&
3450 rex->offs[nums[n]].end != -1)
0a4db386
YO
3451 {
3452 return nums[n];
3453 }
3454 }
3455 return 0;
3456}
3457
2f554ef7 3458
c74f6de9 3459static bool
984e6dd1 3460S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 3461 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 3462{
79a2a0e8
KW
3463 /* This function determines if there are one or two characters that match
3464 * the first character of the passed-in EXACTish node <text_node>, and if
3465 * so, returns them in the passed-in pointers.
c74f6de9 3466 *
79a2a0e8
KW
3467 * If it determines that no possible character in the target string can
3468 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3469 * the first character in <text_node> requires UTF-8 to represent, and the
3470 * target string isn't in UTF-8.)
c74f6de9 3471 *
79a2a0e8
KW
3472 * If there are more than two characters that could match the beginning of
3473 * <text_node>, or if more context is required to determine a match or not,
3474 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3475 *
3476 * The motiviation behind this function is to allow the caller to set up
3477 * tight loops for matching. If <text_node> is of type EXACT, there is
3478 * only one possible character that can match its first character, and so
3479 * the situation is quite simple. But things get much more complicated if
3480 * folding is involved. It may be that the first character of an EXACTFish
3481 * node doesn't participate in any possible fold, e.g., punctuation, so it
3482 * can be matched only by itself. The vast majority of characters that are
3483 * in folds match just two things, their lower and upper-case equivalents.
3484 * But not all are like that; some have multiple possible matches, or match
3485 * sequences of more than one character. This function sorts all that out.
3486 *
3487 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3488 * loop of trying to match A*, we know we can't exit where the thing
3489 * following it isn't a B. And something can't be a B unless it is the
3490 * beginning of B. By putting a quick test for that beginning in a tight
3491 * loop, we can rule out things that can't possibly be B without having to
3492 * break out of the loop, thus avoiding work. Similarly, if A is a single
3493 * character, we can make a tight loop matching A*, using the outputs of
3494 * this function.
3495 *
3496 * If the target string to match isn't in UTF-8, and there aren't
3497 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3498 * the one or two possible octets (which are characters in this situation)
3499 * that can match. In all cases, if there is only one character that can
3500 * match, *<c1p> and *<c2p> will be identical.
3501 *
3502 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3503 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3504 * can match the beginning of <text_node>. They should be declared with at
3505 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3506 * undefined what these contain.) If one or both of the buffers are
3507 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3508 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3509 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3510 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3511 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9 3512
ba44c216 3513 const bool utf8_target = reginfo->is_utf8_target;
79a2a0e8 3514