This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
re_intuit_start(): de-duplicate condition
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
e1cf74e3
CB
86#ifdef DEBUGGING
87/* At least one required character in the target string is expressible only in
88 * UTF-8. */
89static const char* const non_utf8_target_but_utf8_required
90 = "Can't match, because target string needs to be in UTF-8\n";
91#endif
92
93#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
95 goto target; \
96} STMT_END
97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
2d66f61e 104/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
3db24e1e 107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 115#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad 117#define HOPc(pos,off) \
ba44c216 118 (char *)(reginfo->is_utf8_target \
220db18a 119 ? reghop3((U8*)pos, off, \
9d9163fb 120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad
YO
121 : (U8*)(pos + off))
122#define HOPBACKc(pos, off) \
ba44c216 123 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
124 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
125 : (pos - off >= reginfo->strbeg) \
8e11feef 126 ? (U8*)pos - off \
3dab1dad 127 : NULL)
efb30f32 128
ba44c216 129#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 130#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 131
8e9f2289
DM
132/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
133 * off must be >=0; args should be vars rather than expressions */
134#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
135 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
136 : (U8*)((pos + off) > lim ? lim : (pos + off)))
137
2974eaec
DM
138#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
139 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
140 : (U8*)(pos + off))
141#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
142
143#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
144#define NEXTCHR_IS_EOS (nextchr < 0)
145
146#define SET_nextchr \
220db18a 147 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
148
149#define SET_locinput(p) \
150 locinput = (p); \
151 SET_nextchr
152
153
2a16ac92 154#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
155 if (!swash_ptr) { \
156 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 157 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 158 1, 0, invlist, &flags); \
c7304fe2
KW
159 assert(swash_ptr); \
160 } \
161 } STMT_END
162
163/* If in debug mode, we test that a known character properly matches */
164#ifdef DEBUGGING
165# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
166 property_name, \
2a16ac92 167 invlist, \
c7304fe2 168 utf8_char_in_property) \
2a16ac92 169 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
170 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
171#else
172# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
173 property_name, \
2a16ac92 174 invlist, \
c7304fe2 175 utf8_char_in_property) \
2a16ac92 176 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 177#endif
d1eb3177 178
c7304fe2
KW
179#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
180 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
181 "", \
182 PL_XPosix_ptrs[_CC_WORDCHAR], \
df38da56 183 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
c7304fe2
KW
184
185#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
186 STMT_START { \
187 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
188 "_X_regular_begin", \
2a16ac92 189 NULL, \
df38da56 190 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
c7304fe2
KW
191 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
192 "_X_extend", \
2a16ac92 193 NULL, \
c7304fe2
KW
194 COMBINING_GRAVE_ACCENT_UTF8); \
195 } STMT_END
d1eb3177 196
c7304fe2 197#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
198/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
199
5f80c4cf 200/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
201/* it would be nice to rework regcomp.sym to generate this stuff. sigh
202 *
203 * NOTE that *nothing* that affects backtracking should be in here, specifically
204 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
205 * node that is in between two EXACT like nodes when ascertaining what the required
206 * "follow" character is. This should probably be moved to regex compile time
207 * although it may be done at run time beause of the REF possibility - more
208 * investigation required. -- demerphq
209*/
baa60164
KW
210#define JUMPABLE(rn) ( \
211 OP(rn) == OPEN || \
3e901dc0 212 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
baa60164
KW
213 OP(rn) == EVAL || \
214 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
215 OP(rn) == PLUS || OP(rn) == MINMOD || \
216 OP(rn) == KEEPS || \
217 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 218)
ee9b8eae 219#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 220
ee9b8eae
YO
221#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
222
223#if 0
224/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
225 we don't need this definition. */
226#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 227#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
228#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
229
230#else
231/* ... so we use this as its faster. */
232#define IS_TEXT(rn) ( OP(rn)==EXACT )
098b07d5 233#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
234#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
235#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
236
237#endif
e2d8ce26 238
a84d97b6
HS
239/*
240 Search for mandatory following text node; for lookahead, the text must
241 follow but for lookbehind (rn->flags != 0) we skip to the next step.
242*/
baa60164 243#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
244 while (JUMPABLE(rn)) { \
245 const OPCODE type = OP(rn); \
246 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 247 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 248 else if (type == PLUS) \
cca55fe3 249 rn = NEXTOPER(rn); \
3dab1dad 250 else if (type == IFMATCH) \
a84d97b6 251 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 252 else rn += NEXT_OFF(rn); \
3dab1dad 253 } \
5f80c4cf 254} STMT_END
74750237 255
22913b96
KW
256/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
257 * These are for the pre-composed Hangul syllables, which are all in a
258 * contiguous block and arranged there in such a way so as to facilitate
259 * alorithmic determination of their characteristics. As such, they don't need
260 * a swash, but can be determined by simple arithmetic. Almost all are
261 * GCB=LVT, but every 28th one is a GCB=LV */
262#define SBASE 0xAC00 /* Start of block */
263#define SCount 11172 /* Length of block */
264#define TCount 28
c476f425 265
006f26b2
DM
266#define SLAB_FIRST(s) (&(s)->states[0])
267#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
268
a75351a1 269static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 270static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 271static regmatch_state * S_push_slab(pTHX);
51371543 272
87c0511b 273#define REGCP_PAREN_ELEMS 3
f067efbf 274#define REGCP_OTHER_ELEMS 3
e0fa7e2b 275#define REGCP_FRAME_ELEMS 1
620d5b66
NC
276/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
277 * are needed for the regexp context stack bookkeeping. */
278
76e3520e 279STATIC CHECKPOINT
92da3157 280S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 281{
97aff369 282 dVAR;
a3b680e6 283 const int retval = PL_savestack_ix;
92da3157
DM
284 const int paren_elems_to_push =
285 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
286 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
287 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 288 I32 p;
40a82448 289 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 290
b93070ed
DM
291 PERL_ARGS_ASSERT_REGCPPUSH;
292
e49a9654 293 if (paren_elems_to_push < 0)
d1b2014a
YO
294 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i",
295 paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS);
e49a9654 296
e0fa7e2b
NC
297 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
298 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 299 " out of range (%lu-%ld)",
92da3157
DM
300 total_elems,
301 (unsigned long)maxopenparen,
302 (long)parenfloor);
e0fa7e2b 303
620d5b66 304 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 305
495f47a5 306 DEBUG_BUFFERS_r(
92da3157 307 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
308 PerlIO_printf(Perl_debug_log,
309 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
310 PTR2UV(rex),
311 PTR2UV(rex->offs)
312 );
313 );
92da3157 314 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 315/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
316 SSPUSHIV(rex->offs[p].end);
317 SSPUSHIV(rex->offs[p].start);
1ca2007e 318 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 319 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
320 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
321 (UV)p,
322 (IV)rex->offs[p].start,
323 (IV)rex->offs[p].start_tmp,
324 (IV)rex->offs[p].end
40a82448 325 ));
a0d0e21e 326 }
b1ce53c5 327/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 328 SSPUSHINT(maxopenparen);
b93070ed
DM
329 SSPUSHINT(rex->lastparen);
330 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 331 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 332
a0d0e21e
LW
333 return retval;
334}
335
c277df42 336/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
337#define REGCP_SET(cp) \
338 DEBUG_STATE_r( \
ab3bbdeb 339 PerlIO_printf(Perl_debug_log, \
e4f74956 340 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
341 (IV)PL_savestack_ix)); \
342 cp = PL_savestack_ix
c3464db5 343
ab3bbdeb 344#define REGCP_UNWIND(cp) \
e4f74956 345 DEBUG_STATE_r( \
ab3bbdeb 346 if (cp != PL_savestack_ix) \
e4f74956
YO
347 PerlIO_printf(Perl_debug_log, \
348 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
349 (IV)(cp), (IV)PL_savestack_ix)); \
350 regcpblow(cp)
c277df42 351
a8d1f4b4
DM
352#define UNWIND_PAREN(lp, lcp) \
353 for (n = rex->lastparen; n > lp; n--) \
354 rex->offs[n].end = -1; \
355 rex->lastparen = n; \
356 rex->lastcloseparen = lcp;
357
358
f067efbf 359STATIC void
92da3157 360S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 361{
97aff369 362 dVAR;
e0fa7e2b 363 UV i;
87c0511b 364 U32 paren;
a3621e74
YO
365 GET_RE_DEBUG_FLAGS_DECL;
366
7918f24d
NC
367 PERL_ARGS_ASSERT_REGCPPOP;
368
b1ce53c5 369 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 370 i = SSPOPUV;
e0fa7e2b
NC
371 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
372 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
373 rex->lastcloseparen = SSPOPINT;
374 rex->lastparen = SSPOPINT;
92da3157 375 *maxopenparen_p = SSPOPINT;
b1ce53c5 376
620d5b66 377 i -= REGCP_OTHER_ELEMS;
b1ce53c5 378 /* Now restore the parentheses context. */
495f47a5
DM
379 DEBUG_BUFFERS_r(
380 if (i || rex->lastparen + 1 <= rex->nparens)
381 PerlIO_printf(Perl_debug_log,
382 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
383 PTR2UV(rex),
384 PTR2UV(rex->offs)
385 );
386 );
92da3157 387 paren = *maxopenparen_p;
620d5b66 388 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 389 SSize_t tmps;
1ca2007e 390 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
391 rex->offs[paren].start = SSPOPIV;
392 tmps = SSPOPIV;
b93070ed
DM
393 if (paren <= rex->lastparen)
394 rex->offs[paren].end = tmps;
495f47a5
DM
395 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
396 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
397 (UV)paren,
398 (IV)rex->offs[paren].start,
399 (IV)rex->offs[paren].start_tmp,
400 (IV)rex->offs[paren].end,
401 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 402 );
87c0511b 403 paren--;
a0d0e21e 404 }
daf18116 405#if 1
dafc8851
JH
406 /* It would seem that the similar code in regtry()
407 * already takes care of this, and in fact it is in
408 * a better location to since this code can #if 0-ed out
409 * but the code in regtry() is needed or otherwise tests
410 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
411 * (as of patchlevel 7877) will fail. Then again,
412 * this code seems to be necessary or otherwise
225593e1
DM
413 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
414 * --jhi updated by dapm */
b93070ed 415 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 416 if (i > *maxopenparen_p)
b93070ed
DM
417 rex->offs[i].start = -1;
418 rex->offs[i].end = -1;
495f47a5
DM
419 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
420 " \\%"UVuf": %s ..-1 undeffing\n",
421 (UV)i,
92da3157 422 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 423 ));
a0d0e21e 424 }
dafc8851 425#endif
a0d0e21e
LW
426}
427
74088413
DM
428/* restore the parens and associated vars at savestack position ix,
429 * but without popping the stack */
430
431STATIC void
92da3157 432S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
433{
434 I32 tmpix = PL_savestack_ix;
435 PL_savestack_ix = ix;
92da3157 436 regcppop(rex, maxopenparen_p);
74088413
DM
437 PL_savestack_ix = tmpix;
438}
439
02db2b7b 440#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 441
31c7f561
KW
442STATIC bool
443S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
444{
445 /* Returns a boolean as to whether or not 'character' is a member of the
446 * Posix character class given by 'classnum' that should be equivalent to a
447 * value in the typedef '_char_class_number'.
448 *
449 * Ideally this could be replaced by a just an array of function pointers
450 * to the C library functions that implement the macros this calls.
451 * However, to compile, the precise function signatures are required, and
452 * these may vary from platform to to platform. To avoid having to figure
453 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
454 * which adds an extra layer of function call overhead (unless the C
455 * optimizer strips it away). But we don't particularly care about
456 * performance with locales anyway. */
31c7f561
KW
457
458 switch ((_char_class_number) classnum) {
15861f94 459 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 460 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
461 case _CC_ENUM_ASCII: return isASCII_LC(character);
462 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
463 case _CC_ENUM_CASED: return isLOWER_LC(character)
464 || isUPPER_LC(character);
e8d596e0 465 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
466 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
467 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
468 case _CC_ENUM_LOWER: return isLOWER_LC(character);
469 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 470 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 471 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 472 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
473 case _CC_ENUM_UPPER: return isUPPER_LC(character);
474 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 475 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
476 default: /* VERTSPACE should never occur in locales */
477 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
478 }
479
480 assert(0); /* NOTREACHED */
481 return FALSE;
482}
483
3018b823
KW
484STATIC bool
485S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
486{
487 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
488 * 'character' is a member of the Posix character class given by 'classnum'
489 * that should be equivalent to a value in the typedef
490 * '_char_class_number'.
491 *
492 * This just calls isFOO_lc on the code point for the character if it is in
493 * the range 0-255. Outside that range, all characters avoid Unicode
494 * rules, ignoring any locale. So use the Unicode function if this class
495 * requires a swash, and use the Unicode macro otherwise. */
496
497 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
498
499 if (UTF8_IS_INVARIANT(*character)) {
500 return isFOO_lc(classnum, *character);
501 }
502 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
503 return isFOO_lc(classnum,
94bb8c36 504 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
505 }
506
507 if (classnum < _FIRST_NON_SWASH_CC) {
508
509 /* Initialize the swash unless done already */
510 if (! PL_utf8_swash_ptrs[classnum]) {
511 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
512 PL_utf8_swash_ptrs[classnum] =
513 _core_swash_init("utf8",
514 "",
515 &PL_sv_undef, 1, 0,
516 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
517 }
518
92a2046b
KW
519 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
520 character,
521 TRUE /* is UTF */ ));
3018b823
KW
522 }
523
524 switch ((_char_class_number) classnum) {
525 case _CC_ENUM_SPACE:
526 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
527
528 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
529 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
530 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
531 default: return 0; /* Things like CNTRL are always
532 below 256 */
533 }
534
535 assert(0); /* NOTREACHED */
536 return FALSE;
537}
538
a687059c 539/*
e50aee73 540 * pregexec and friends
a687059c
LW
541 */
542
76234dfb 543#ifndef PERL_IN_XSUB_RE
a687059c 544/*
c277df42 545 - pregexec - match a regexp against a string
a687059c 546 */
c277df42 547I32
5aaab254 548Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 549 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
550/* stringarg: the point in the string at which to begin matching */
551/* strend: pointer to null at end of string */
552/* strbeg: real beginning of string */
553/* minend: end of match must be >= minend bytes after stringarg. */
554/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
555 * itself is accessed via the pointers above */
556/* nosave: For optimizations. */
c277df42 557{
7918f24d
NC
558 PERL_ARGS_ASSERT_PREGEXEC;
559
c277df42 560 return
9041c2e3 561 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
562 nosave ? 0 : REXEC_COPY_STR);
563}
76234dfb 564#endif
22e551b9 565
9041c2e3 566/*
cad2e5aa
JH
567 * Need to implement the following flags for reg_anch:
568 *
569 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
570 * USE_INTUIT_ML
571 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
572 * INTUIT_AUTORITATIVE_ML
573 * INTUIT_ONCE_NOML - Intuit can match in one location only.
574 * INTUIT_ONCE_ML
575 *
576 * Another flag for this function: SECOND_TIME (so that float substrs
577 * with giant delta may be not rechecked).
578 */
579
3f7c398e 580/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
581 Otherwise, only SvCUR(sv) is used to get strbeg. */
582
6eb5f6b9
JH
583/* XXXX Some places assume that there is a fixed substring.
584 An update may be needed if optimizer marks as "INTUITable"
585 RExen without fixed substrings. Similarly, it is assumed that
586 lengths of all the strings are no more than minlen, thus they
587 cannot come from lookahead.
40d049e4
YO
588 (Or minlen should take into account lookahead.)
589 NOTE: Some of this comment is not correct. minlen does now take account
590 of lookahead/behind. Further research is required. -- demerphq
591
592*/
6eb5f6b9 593
2c2d71f5
JH
594/* A failure to find a constant substring means that there is no need to make
595 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 596 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
597 regtry() should be needed.
598
599 REx compiler's optimizer found 4 possible hints:
600 a) Anchored substring;
601 b) Fixed substring;
602 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 603 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 604 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
605 string which does not contradict any of them.
606 */
2c2d71f5 607
6eb5f6b9
JH
608/* Most of decisions we do here should have been done at compile time.
609 The nodes of the REx which we used for the search should have been
610 deleted from the finite automaton. */
611
52a21eb3
DM
612/* args:
613 * rx: the regex to match against
614 * sv: the SV being matched: only used for utf8 flag; the string
615 * itself is accessed via the pointers below. Note that on
616 * something like an overloaded SV, SvPOK(sv) may be false
617 * and the string pointers may point to something unrelated to
618 * the SV itself.
619 * strbeg: real beginning of string
620 * strpos: the point in the string at which to begin matching
621 * strend: pointer to the byte following the last char of the string
622 * flags currently unused; set to 0
623 * data: currently unused; set to NULL
624 */
625
cad2e5aa 626char *
52a21eb3
DM
627Perl_re_intuit_start(pTHX_
628 REGEXP * const rx,
629 SV *sv,
630 const char * const strbeg,
631 char *strpos,
632 char *strend,
633 const U32 flags,
634 re_scream_pos_data *data)
cad2e5aa 635{
97aff369 636 dVAR;
8d919b0a 637 struct regexp *const prog = ReANY(rx);
49f55535 638 SSize_t start_shift = 0;
cad2e5aa 639 /* Should be nonnegative! */
ea3daa5d 640 SSize_t end_shift = 0;
0fc004dd
DM
641 /* current lowest pos in string where the regex can start matching */
642 char *rx_origin = strpos;
eb578fdb 643 SV *check;
f2ed9b32 644 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 645 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 646 bool ml_anch = 0;
8f4bf5fc 647 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 648 char *check_at = NULL; /* check substr found at this pos */
d8080198 649 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 650 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 651 RXi_GET_DECL(prog,progi);
02d5137b
DM
652 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
653 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
654 GET_RE_DEBUG_FLAGS_DECL;
655
7918f24d 656 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
657 PERL_UNUSED_ARG(flags);
658 PERL_UNUSED_ARG(data);
7918f24d 659
1dc475d0
DM
660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
661 "Intuit: trying to determine minimum start position...\n"));
662
fb9bbddb
DM
663 /* for now, assume that all substr offsets are positive. If at some point
664 * in the future someone wants to do clever things with look-behind and
665 * -ve offsets, they'll need to fix up any code in this function
666 * which uses these offsets. See the thread beginning
667 * <20140113145929.GF27210@iabyn.com>
668 */
669 assert(prog->substrs->data[0].min_offset >= 0);
670 assert(prog->substrs->data[0].max_offset >= 0);
671 assert(prog->substrs->data[1].min_offset >= 0);
672 assert(prog->substrs->data[1].max_offset >= 0);
673 assert(prog->substrs->data[2].min_offset >= 0);
674 assert(prog->substrs->data[2].max_offset >= 0);
675
f7022b5a
DM
676 /* for now, assume that if both present, that the floating substring
677 * follows the anchored substring, and that they don't overlap.
678 * If you break this assumption (e.g. doing better optimisations
679 * with lookahead/behind), then you'll need to audit the code in this
680 * function carefully first
681 */
682 assert(
683 ! ( (prog->anchored_utf8 || prog->anchored_substr)
684 && (prog->float_utf8 || prog->float_substr))
685 || (prog->float_min_offset >= prog->anchored_offset));
686
c344f387
JH
687 /* CHR_DIST() would be more correct here but it makes things slow. */
688 if (prog->minlen > strend - strpos) {
a3621e74 689 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 690 " String too short...\n"));
cad2e5aa 691 goto fail;
2c2d71f5 692 }
d8da0584 693
6c3fea77 694 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 695 reginfo->info_aux = NULL;
9d9163fb 696 reginfo->strbeg = strbeg;
220db18a 697 reginfo->strend = strend;
aed7b151 698 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 699 reginfo->intuit = 1;
1cb48e53
DM
700 /* not actually used within intuit, but zero for safety anyway */
701 reginfo->poscache_maxiter = 0;
02d5137b 702
f2ed9b32 703 if (utf8_target) {
33b8afdf
JH
704 if (!prog->check_utf8 && prog->check_substr)
705 to_utf8_substr(prog);
706 check = prog->check_utf8;
707 } else {
7e0d5ad7
KW
708 if (!prog->check_substr && prog->check_utf8) {
709 if (! to_byte_substr(prog)) {
6b54ddc5 710 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
711 }
712 }
33b8afdf
JH
713 check = prog->check_substr;
714 }
274cd312 715
1dc475d0
DM
716 /* dump the various substring data */
717 DEBUG_OPTIMISE_MORE_r({
718 int i;
719 for (i=0; i<=2; i++) {
720 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
721 : prog->substrs->data[i].substr);
722 if (!sv)
723 continue;
724
725 PerlIO_printf(Perl_debug_log,
726 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
727 " useful=%"IVdf" utf8=%d [%s]\n",
728 i,
729 (IV)prog->substrs->data[i].min_offset,
730 (IV)prog->substrs->data[i].max_offset,
731 (IV)prog->substrs->data[i].end_shift,
732 BmUSEFUL(sv),
733 utf8_target ? 1 : 0,
734 SvPEEK(sv));
735 }
736 });
737
8e1490ee 738 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
274cd312
DM
739 /* Check after \n? */
740 ml_anch = ( (prog->intflags & PREGf_ANCH_MBOL)
741 || ((prog->intflags & PREGf_ANCH_BOL) && multiline));
cad2e5aa 742
7e25d62c 743 if (!ml_anch) {
c889ccc8
DM
744 /* we are only allowed to match at BOS or \G */
745
57fcbfa7
DM
746 /* trivially reject if there's a BOS anchor and we're not at BOS.
747 * In the case of \G, we hope(!) that the caller has already
748 * set strpos to pos()-gofs, and will already have checked
749 * that this anchor position is legal. So we can skip it here.
750 */
751 if ( !(prog->intflags & PREGf_ANCH_GPOS)
752 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
753 && (strpos != strbeg))
c889ccc8
DM
754 {
755 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 756 " Not at start...\n"));
c889ccc8
DM
757 goto fail;
758 }
759
a5d12a4b
DM
760 /* in the presence of an anchor, the anchored (relative to the
761 * start of the regex) substr must also be anchored relative
762 * to strpos. So quickly reject if substr isn't found there */
763
c889ccc8
DM
764 if (prog->check_offset_min == prog->check_offset_max
765 && !(prog->intflags & PREGf_CANY_SEEN)
766 && ! multiline) /* /m can cause \n's to match that aren't
767 accounted for in the string max length.
768 See [perl #115242] */
769 {
770 /* Substring at constant offset from beg-of-str... */
d307bf57 771 SSize_t slen = SvCUR(check);
7ef63983 772 char *s;
c889ccc8
DM
773
774 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 775
1dc475d0
DM
776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
777 " Looking for check substr at fixed offset %"IVdf"...\n",
778 (IV)prog->check_offset_min));
779
c889ccc8 780 if (SvTAIL(check)) {
d307bf57
DM
781 /* In this case, the regex is anchored at the end too,
782 * so the lengths must match exactly, give or take a \n.
783 * NB: slen >= 1 since the last char of check is \n */
c889ccc8
DM
784 if ( strend - s > slen || strend - s < slen - 1
785 || (strend - s == slen && strend[-1] != '\n'))
786 {
787 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 788 " String too long...\n"));
c889ccc8
DM
789 goto fail_finish;
790 }
791 /* Now should match s[0..slen-2] */
792 slen--;
c889ccc8 793 }
d307bf57
DM
794 if (slen && (*SvPVX_const(check) != *s
795 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
796 {
797 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 798 " String not equal...\n"));
d307bf57
DM
799 goto fail_finish;
800 }
c889ccc8
DM
801
802 check_at = s;
803 goto success_at_start;
cad2e5aa 804 }
cad2e5aa 805 }
cad2e5aa 806 }
0fc004dd 807
c0e0ec46
DM
808 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
809 end_shift = prog->check_end_shift;
cad2e5aa 810
19188028 811#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 812 if (end_shift < 0)
1de06328 813 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 814 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
815#endif
816
2c2d71f5 817 restart:
0fc004dd
DM
818 /* Find a candidate regex origin in the region rx_origin..strend
819 * by looking for the "check" substring in that region, corrected by
820 * start/end_shift.
821 */
1de06328
YO
822
823 {
c33e64f0
FC
824 U8* start_point;
825 U8* end_point;
c889ccc8 826
c889ccc8 827 DEBUG_OPTIMISE_MORE_r({
1dc475d0 828 PerlIO_printf(Perl_debug_log,
ae5d4331 829 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
1dc475d0 830 " Start shift: %"IVdf" End shift %"IVdf
4d281893 831 " Real end Shift: %"IVdf"\n",
197641b6 832 (IV)(rx_origin - strpos),
c889ccc8 833 (IV)prog->check_offset_min,
12fbc530
DM
834 (IV)start_shift,
835 (IV)end_shift,
c889ccc8
DM
836 (IV)prog->check_end_shift);
837 });
1de06328 838
0d331aaf 839 if (prog->intflags & PREGf_CANY_SEEN) {
0fc004dd 840 start_point= (U8*)(rx_origin + start_shift);
12fbc530 841 end_point= (U8*)(strend - end_shift);
1de06328 842 } else {
0fc004dd 843 start_point= HOP3(rx_origin, start_shift, strend);
12fbc530 844 end_point= HOP3(strend, -end_shift, strbeg);
1de06328 845 }
c889ccc8 846
77656d5b
DM
847 /* if the regex is absolutely anchored to the start of the string,
848 * then check_offset_max represents an upper bound on the string
849 * where the substr could start */
c19c836a
DM
850 if (!ml_anch
851 && prog->intflags & PREGf_ANCH
77656d5b
DM
852 && prog->check_offset_max != SSize_t_MAX
853 && start_shift < prog->check_offset_max)
c19c836a 854 {
1a08ba3a
DM
855 SSize_t len = SvCUR(check) - !!SvTAIL(check);
856 end_point = HOP3lim(start_point,
857 prog->check_offset_max - start_shift,
858 end_point -len)
859 + len;
d6ef1678
DM
860 }
861
6bda09f9 862 DEBUG_OPTIMISE_MORE_r({
1dc475d0 863 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
1de06328 864 (int)(end_point - start_point),
fc8cd66c 865 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
866 start_point);
867 });
868
ae5d4331 869 check_at = fbm_instr( start_point, end_point,
7fba1cd6 870 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 871 }
c889ccc8 872
cad2e5aa
JH
873 /* Update the count-of-usability, remove useless subpatterns,
874 unshift s. */
2c2d71f5 875
ab3bbdeb 876 DEBUG_EXECUTE_r({
f2ed9b32 877 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb 878 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1dc475d0 879 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
ae5d4331 880 (check_at ? "Found" : "Did not find"),
f2ed9b32 881 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
882 ? "anchored" : "floating"),
883 quoted,
884 RE_SV_TAIL(check),
ae5d4331 885 (check_at ? " at offset " : "...\n") );
ab3bbdeb 886 });
2c2d71f5 887
ae5d4331 888 if (!check_at)
2c2d71f5 889 goto fail_finish;
2c2d71f5 890 /* Finish the diagnostic message */
197641b6 891 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
2c2d71f5 892
fdc003fd 893 /* set rx_origin to the minimum position where the regex could start
b96863b8
DM
894 * matching, given the constraint of the just-matched check substring.
895 * But don't set it lower than previously.
fdc003fd
DM
896 */
897
b96863b8
DM
898 if (check_at - rx_origin > prog->check_offset_max)
899 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
fdc003fd
DM
900
901
1de06328
YO
902 /* XXX dmq: first branch is for positive lookbehind...
903 Our check string is offset from the beginning of the pattern.
904 So we need to do any stclass tests offset forward from that
905 point. I think. :-(
906 */
907
2c2d71f5
JH
908 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
909 Start with the other substr.
910 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 911 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
912 *always* match. Probably should be marked during compile...
913 Probably it is right to do no SCREAM here...
914 */
915
6480a6c4
DM
916 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
917 : prog->substrs->data[other_ix].substr)
1de06328 918 {
30944b6d 919 /* Take into account the "other" substring. */
6c3343a6
DM
920 char *last, *last1;
921 char *s;
922 SV* must;
923 struct reg_substr_datum *other;
924
925 do_other_substr:
926 other = &prog->substrs->data[other_ix];
927
928 /* if "other" is anchored:
929 * we've previously found a floating substr starting at check_at.
930 * This means that the regex origin must lie somewhere
931 * between min (rx_origin): HOP3(check_at, -check_offset_max)
932 * and max: HOP3(check_at, -check_offset_min)
933 * (except that min will be >= strpos)
934 * So the fixed substr must lie somewhere between
935 * HOP3(min, anchored_offset)
936 * HOP3(max, anchored_offset) + SvCUR(substr)
937 */
938
939 /* if "other" is floating
940 * Calculate last1, the absolute latest point where the
941 * floating substr could start in the string, ignoring any
942 * constraints from the earlier fixed match. It is calculated
943 * as follows:
944 *
945 * strend - prog->minlen (in chars) is the absolute latest
946 * position within the string where the origin of the regex
947 * could appear. The latest start point for the floating
948 * substr is float_min_offset(*) on from the start of the
949 * regex. last1 simply combines thee two offsets.
950 *
951 * (*) You might think the latest start point should be
952 * float_max_offset from the regex origin, and technically
953 * you'd be correct. However, consider
954 * /a\d{2,4}bcd\w/
955 * Here, float min, max are 3,5 and minlen is 7.
956 * This can match either
957 * /a\d\dbcd\w/
958 * /a\d\d\dbcd\w/
959 * /a\d\d\d\dbcd\w/
960 * In the first case, the regex matches minlen chars; in the
961 * second, minlen+1, in the third, minlen+2.
962 * In the first case, the floating offset is 3 (which equals
963 * float_min), in the second, 4, and in the third, 5 (which
964 * equals float_max). In all cases, the floating string bcd
965 * can never start more than 4 chars from the end of the
966 * string, which equals minlen - float_min. As the substring
967 * starts to match more than float_min from the start of the
968 * regex, it makes the regex match more than minlen chars,
969 * and the two cancel each other out. So we can always use
970 * float_min - minlen, rather than float_max - minlen for the
971 * latest position in the string.
972 *
973 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
974 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
975 */
976
e7a14a9c 977 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
978 last1 = HOP3c(strend,
979 other->min_offset - prog->minlen, strbeg);
980
4d006249 981 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
982 /* last is the latest point where the floating substr could
983 * start, *given* any constraints from the earlier fixed
984 * match. This constraint is that the floating string starts
985 * <= float_max_offset chars from the regex origin (rx_origin).
986 * If this value is less than last1, use it instead.
eb3831ce 987 */
6c3343a6
DM
988 assert(rx_origin <= last1);
989 last =
990 /* this condition handles the offset==infinity case, and
991 * is a short-cut otherwise. Although it's comparing a
992 * byte offset to a char length, it does so in a safe way,
993 * since 1 char always occupies 1 or more bytes,
994 * so if a string range is (last1 - rx_origin) bytes,
995 * it will be less than or equal to (last1 - rx_origin)
996 * chars; meaning it errs towards doing the accurate HOP3
997 * rather than just using last1 as a short-cut */
998 (last1 - rx_origin) < other->max_offset
999 ? last1
1000 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1001 }
1002 else {
1003 assert(strpos + start_shift <= check_at);
1004 last = HOP4c(check_at, other->min_offset - start_shift,
1005 strbeg, strend);
1006 }
ead917d0 1007
6c3343a6
DM
1008 s = HOP3c(rx_origin, other->min_offset, strend);
1009 if (s < other_last) /* These positions already checked */
1010 s = other_last;
1011
1012 must = utf8_target ? other->utf8_substr : other->substr;
1013 assert(SvPOK(must));
1014 s = fbm_instr(
1015 (unsigned char*)s,
1016 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1017 must,
1018 multiline ? FBMrf_MULTILINE : 0
1019 );
1020 DEBUG_EXECUTE_r({
1021 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1022 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1023 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1024 s ? "Found" : "Contradicts",
1025 other_ix ? "floating" : "anchored",
1026 quoted, RE_SV_TAIL(must));
1027 });
ead917d0 1028
ead917d0 1029
6c3343a6
DM
1030 if (!s) {
1031 /* last1 is latest possible substr location. If we didn't
1032 * find it before there, we never will */
1033 if (last >= last1) {
1034 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1035 ", giving up...\n"));
1036 goto fail_finish;
ead917d0
DM
1037 }
1038
6c3343a6
DM
1039 /* try to find the check substr again at a later
1040 * position. Maybe next time we'll find the "other" substr
1041 * in range too */
1042 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1043 ", trying %s at offset %ld...\n",
1044 (other_ix ? "floating" : "anchored"),
197641b6 1045 (long)(HOP3c(check_at, 1, strend) - strpos)));
6c3343a6
DM
1046
1047 other_last = HOP3c(last, 1, strend) /* highest failure */;
1048 rx_origin =
4d006249 1049 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1050 ? HOP3c(rx_origin, 1, strend)
1051 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1052 goto restart;
1053 }
1054 else {
1055 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
197641b6 1056 (long)(s - strpos)));
ead917d0 1057
4d006249 1058 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1059 /* other_last is set to s, not s+1, since its possible for
1060 * a floating substr to fail first time, then succeed
1061 * second time at the same floating position; e.g.:
1062 * "-AB--AABZ" =~ /\wAB\d*Z/
1063 * The first time round, anchored and float match at
1064 * "-(AB)--AAB(Z)" then fail on the initial \w character
1065 * class. Second time round, they match at "-AB--A(AB)(Z)".
1066 */
1067 other_last = s;
ead917d0
DM
1068 }
1069 else {
6c3343a6
DM
1070 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1071 other_last = HOP3c(s, 1, strend);
ead917d0 1072 }
6c3343a6 1073 }
cad2e5aa 1074 }
acba93e8
DM
1075 else {
1076 DEBUG_OPTIMISE_MORE_r(
1077 PerlIO_printf(Perl_debug_log,
1078 " Check-only match: offset min:%"IVdf" max:%"IVdf
1c1c599d 1079 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
acba93e8
DM
1080 " strend-strpos:%"IVdf"\n",
1081 (IV)prog->check_offset_min,
1082 (IV)prog->check_offset_max,
1c1c599d 1083 (IV)(check_at-strpos),
acba93e8 1084 (IV)(rx_origin-strpos),
1c1c599d 1085 (IV)(rx_origin-check_at),
acba93e8
DM
1086 (IV)(strend-strpos)
1087 )
1088 );
1089 }
2c2d71f5 1090
acba93e8 1091 postprocess_substr_matches:
0991020e 1092
e3c6feb0
DM
1093 /* handle the extra constraint of /^/m */
1094
a71a26b9
DM
1095 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n'
1096 /* May be due to an implicit anchor of m{.*foo} */
1097 && !(prog->intflags & PREGf_IMPLICIT))
e3c6feb0 1098 {
4620cb61
DM
1099 char *s;
1100
a62659bd
DM
1101 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1102 " looking for /^/m anchor"));
d0880ea7
DM
1103
1104 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1105 * Find the next \n, if any, even if it's beyond the current
1106 * anchored and/or floating substrings. Whether we should be
1107 * scanning ahead for the next \n or the next substr is debatable.
1108 * On the one hand you'd expect rare substrings to appear less
1109 * often than \n's. On the other hand, searching for \n means
1110 * we're effectively flipping been check_substr and "\n" on each
1111 * iteration as the current "rarest" string candidate, which
1112 * means for example that we'll quickly reject the whole string if
1113 * hasn't got a \n, rather than trying every substr position
1114 * first
1115 */
d0880ea7 1116
4620cb61
DM
1117 s = HOP3c(strend, - prog->minlen, strpos);
1118 if (s <= rx_origin ||
1119 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1120 {
d0880ea7
DM
1121 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1122 " Did not find /%s^%s/m...\n",
1123 PL_colors[0], PL_colors[1]));
a62659bd
DM
1124 goto fail_finish;
1125 }
d0880ea7 1126
4ada1233
DM
1127 /* earliest possible origin is 1 char after the \n.
1128 * (since *rx_origin == '\n', it's safe to ++ here rather than
1129 * HOP(rx_origin, 1)) */
1130 rx_origin++;
d0880ea7 1131
f4f115de 1132 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1133 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1134 {
d0880ea7
DM
1135 /* Position contradicts check-string; either because
1136 * check was anchored (and thus has no wiggle room),
4ada1233 1137 * or check was float and rx_origin is above the float range */
d0880ea7
DM
1138 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1139 " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
4ada1233 1140 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
d0880ea7
DM
1141 goto restart;
1142 }
1143
1144 /* if we get here, the check substr must have been float,
2e759faa 1145 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1146 * "other" substr which still contradicts */
1147 assert(prog->substrs->check_ix); /* check is float */
1148
1149 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1150 /* whoops, the anchored "other" substr exists, so we still
1151 * contradict. On the other hand, the float "check" substr
1152 * didn't contradict, so just retry the anchored "other"
1153 * substr */
1154 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1155 " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1156 PL_colors[0], PL_colors[1],
1157 (long)(rx_origin - strpos),
1158 (long)(rx_origin - strpos + prog->anchored_offset)));
1159 goto do_other_substr;
1160 }
1161
1162 /* success: we don't contradict the found floating substring
1163 * (and there's no anchored substr). */
d0880ea7
DM
1164 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1165 " Found /%s^%s/m at offset %ld...\n",
1166 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
e3c6feb0
DM
1167 }
1168 else {
2e759faa
DM
1169 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1170 " Starting position does not contradict /%s^%s/m...\n",
1171 PL_colors[0], PL_colors[1]));
e3c6feb0
DM
1172 }
1173
1174
1175 /* Decide whether using the substrings helped */
1176
1177 if (rx_origin != strpos) {
1178 /* Fixed substring is found far enough so that the match
1179 cannot start at strpos. */
1180
1181 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
f2ed9b32 1182 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1183 }
1184 else {
f5952150 1185 /* The found string does not prohibit matching at strpos,
2c2d71f5 1186 - no optimization of calling REx engine can be performed,
f5952150
GS
1187 unless it was an MBOL and we are not after MBOL,
1188 or a future STCLASS check will fail this. */
2c2d71f5 1189 success_at_start:
bbe252da 1190 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1191 && (utf8_target ? (
33b8afdf
JH
1192 prog->check_utf8 /* Could be deleted already */
1193 && --BmUSEFUL(prog->check_utf8) < 0
1194 && (prog->check_utf8 == prog->float_utf8)
1195 ) : (
1196 prog->check_substr /* Could be deleted already */
1197 && --BmUSEFUL(prog->check_substr) < 0
1198 && (prog->check_substr == prog->float_substr)
1199 )))
66e933ab 1200 {
cad2e5aa 1201 /* If flags & SOMETHING - do not do it many times on the same match */
1dc475d0 1202 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
f2ed9b32
KW
1203 /* XXX Does the destruction order has to change with utf8_target? */
1204 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1205 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1206 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1207 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1208 check = NULL; /* abort */
486ec47a 1209 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951 1210 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
8e1490ee
YO
1211 if (prog->intflags & PREGf_IMPLICIT) {
1212 prog->intflags &= ~PREGf_ANCH_MBOL;
1213 /* maybe we have no anchors left after this... */
1214 if (!(prog->intflags & PREGf_ANCH))
1215 prog->extflags &= ~RXf_IS_ANCHORED;
1216 }
3cf5c195
IZ
1217 /* XXXX This is a remnant of the old implementation. It
1218 looks wasteful, since now INTUIT can use many
6eb5f6b9 1219 other heuristics. */
bbe252da 1220 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1221 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa 1222 }
cad2e5aa
JH
1223 }
1224
6eb5f6b9
JH
1225 /* Last resort... */
1226 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1227 /* trie stclasses are too expensive to use here, we are better off to
1228 leave it to regmatch itself */
f8fc2ecf 1229 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1230 /* minlen == 0 is possible if regstclass is \b or \B,
1231 and the fixed substr is ''$.
c765d6e0
DM
1232 Since minlen is already taken into account, rx_origin+1 is before strend;
1233 accidentally, minlen >= 1 guaranties no false positives at rx_origin + 1
6eb5f6b9
JH
1234 even for \b or \B. But (minlen? 1 : 0) below assumes that
1235 regstclass does not come from lookahead... */
1236 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1237 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf 1238 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1239
2c75e362 1240 /* XXX this value could be pre-computed */
f8fc2ecf 1241 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1242 ? (reginfo->is_utf8_pat
1243 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1244 : STR_LEN(progi->regstclass))
66e933ab 1245 : 1);
1de06328 1246 char * endpos;
fa3bb21d 1247 char *s;
1de06328 1248 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
fa3bb21d 1249 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1de06328
YO
1250 else if (prog->float_substr || prog->float_utf8)
1251 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1252 else
1253 endpos= strend;
1254
fa3bb21d
DM
1255 if (checked_upto < rx_origin)
1256 checked_upto = rx_origin;
1dc475d0
DM
1257 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1258 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
fa3bb21d 1259 " rx_origin: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1dc475d0 1260 (IV)start_shift, (IV)(check_at - strbeg),
fa3bb21d 1261 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg),
1dc475d0 1262 (IV)(checked_upto- strbeg)));
d8080198 1263
984e6dd1 1264 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
f9176b44 1265 reginfo);
d8080198
YO
1266 if (s) {
1267 checked_upto = s;
1268 } else {
6eb5f6b9 1269 if (endpos == strend) {
a3621e74 1270 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1271 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1272 goto fail;
1273 }
a3621e74 1274 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1275 " This position contradicts STCLASS...\n") );
8e1490ee 1276 if ((prog->intflags & PREGf_ANCH) && !ml_anch)
653099ff 1277 goto fail;
d8080198 1278 checked_upto = HOPBACKc(endpos, start_shift);
1dc475d0 1279 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
d8080198 1280 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
9fed8d02 1281
6eb5f6b9 1282 /* Contradict one of substrings */
97136c8a
DM
1283 if (prog->anchored_substr || prog->anchored_utf8) {
1284 if (prog->substrs->check_ix == 1) { /* check is float */
1285 /* Have both, check_string is floating */
1286 assert(rx_origin + start_shift <= check_at);
1287 if (rx_origin + start_shift != check_at) {
1288 /* not at latest position float substr could match:
1289 * Recheck anchored substring, but not floating... */
1290 if (!check) {
1291 rx_origin = NULL;
1292 goto giveup;
1293 }
1294 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1295 " Looking for anchored substr starting at offset %ld...\n",
1296 (long)(other_last - strpos)) );
1297 goto do_other_substr;
3369914b 1298 }
3369914b
DM
1299 }
1300 }
97136c8a 1301 else {
9fed8d02
DM
1302 /* float-only */
1303
1304 /* Another way we could have checked stclass at the
1305 current position only: */
1306 if (ml_anch) {
1307 rx_origin++;
1308 if (!check)
1309 goto giveup;
1310 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1311 " Looking for /%s^%s/m starting at offset %ld...\n",
1312 PL_colors[0], PL_colors[1],
1313 (long)(rx_origin - strpos)) );
1314 /* XXX DAPM I don't yet know why this is true, but the code
1315 * assumed it when it used to do goto try_at_offset */
1316 assert(rx_origin != strpos);
1317 goto postprocess_substr_matches;
ab60c45a 1318 }
9fed8d02
DM
1319 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1320 goto fail;
1321 /* Check is floating substring. */
1322 rx_origin = check_at - start_shift;
9fed8d02
DM
1323 }
1324
9fed8d02
DM
1325 rx_origin = HOP3c(rx_origin, 1, strend);
1326 if (rx_origin + start_shift + end_shift > strend) {
1327 /* XXXX Should be taken into account earlier? */
40268e5b 1328 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
9fed8d02
DM
1329 " Could not match STCLASS...\n") );
1330 goto fail;
1331 }
1332 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1333 " Looking for %s substr starting at offset %ld...\n",
1334 (prog->substrs->check_ix ? "floating" : "anchored"),
1335 (long)(rx_origin + start_shift - strpos)) );
1336 goto restart;
6eb5f6b9 1337 }
9fed8d02 1338
5f9c6575 1339 if (rx_origin != s) {
a3621e74 1340 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1341 " By STCLASS: moving %ld --> %ld\n",
5f9c6575 1342 (long)(rx_origin - strpos), (long)(s - strpos))
b7953727
JH
1343 );
1344 }
1345 else {
a3621e74 1346 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1347 " Does not contradict STCLASS...\n");
b7953727
JH
1348 );
1349 }
6eb5f6b9 1350 }
5e39e1e5 1351 giveup:
1dc475d0
DM
1352 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Intuit: %s%s:%s match at offset %ld\n",
1353 PL_colors[4], (check ? "Successfully guessed" : "Giving up"),
197641b6 1354 PL_colors[5], (long)(rx_origin - strpos)) );
c765d6e0 1355 return rx_origin;
2c2d71f5
JH
1356
1357 fail_finish: /* Substring not found */
33b8afdf 1358 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1359 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1360 fail:
a3621e74 1361 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1362 PL_colors[4], PL_colors[5]));
bd61b366 1363 return NULL;
cad2e5aa 1364}
9661b544 1365
a0a388a1 1366#define DECL_TRIE_TYPE(scan) \
098b07d5
KW
1367 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1368 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
fab2782b
YO
1369 trie_type = ((scan->flags == EXACT) \
1370 ? (utf8_target ? trie_utf8 : trie_plain) \
098b07d5
KW
1371 : (scan->flags == EXACTFA) \
1372 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1373 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
fab2782b 1374
fd3249ee 1375#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1376STMT_START { \
fab2782b 1377 STRLEN skiplen; \
baa60164 1378 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1379 switch (trie_type) { \
31f05a37 1380 case trie_utf8_exactfa_fold: \
baa60164
KW
1381 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1382 /* FALL THROUGH */ \
fab2782b
YO
1383 case trie_utf8_fold: \
1384 if ( foldlen>0 ) { \
c80e42f3 1385 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1386 foldlen -= len; \
1387 uscan += len; \
1388 len=0; \
1389 } else { \
445bf929 1390 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1391 len = UTF8SKIP(uc); \
1392 skiplen = UNISKIP( uvc ); \
1393 foldlen -= skiplen; \
1394 uscan = foldbuf + skiplen; \
1395 } \
1396 break; \
baa60164
KW
1397 case trie_latin_utf8_exactfa_fold: \
1398 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1399 /* FALL THROUGH */ \
fab2782b
YO
1400 case trie_latin_utf8_fold: \
1401 if ( foldlen>0 ) { \
c80e42f3 1402 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1403 foldlen -= len; \
1404 uscan += len; \
1405 len=0; \
1406 } else { \
1407 len = 1; \
31f05a37 1408 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1409 skiplen = UNISKIP( uvc ); \
1410 foldlen -= skiplen; \
1411 uscan = foldbuf + skiplen; \
1412 } \
1413 break; \
1414 case trie_utf8: \
c80e42f3 1415 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1416 break; \
1417 case trie_plain: \
1418 uvc = (UV)*uc; \
1419 len = 1; \
1420 } \
1421 if (uvc < 256) { \
1422 charid = trie->charmap[ uvc ]; \
1423 } \
1424 else { \
1425 charid = 0; \
1426 if (widecharmap) { \
1427 SV** const svpp = hv_fetch(widecharmap, \
1428 (char*)&uvc, sizeof(UV), 0); \
1429 if (svpp) \
1430 charid = (U16)SvIV(*svpp); \
1431 } \
1432 } \
4cadc6a9
YO
1433} STMT_END
1434
4cadc6a9
YO
1435#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1436STMT_START { \
1437 while (s <= e) { \
1438 if ( (CoNd) \
fac1af77 1439 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1440 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1441 goto got_it; \
1442 s++; \
1443 } \
1444} STMT_END
1445
1446#define REXEC_FBC_UTF8_SCAN(CoDe) \
1447STMT_START { \
9a902117 1448 while (s < strend) { \
4cadc6a9 1449 CoDe \
9a902117 1450 s += UTF8SKIP(s); \
4cadc6a9
YO
1451 } \
1452} STMT_END
1453
1454#define REXEC_FBC_SCAN(CoDe) \
1455STMT_START { \
1456 while (s < strend) { \
1457 CoDe \
1458 s++; \
1459 } \
1460} STMT_END
1461
1462#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1463REXEC_FBC_UTF8_SCAN( \
1464 if (CoNd) { \
02d5137b 1465 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1466 goto got_it; \
1467 else \
1468 tmp = doevery; \
1469 } \
1470 else \
1471 tmp = 1; \
1472)
1473
1474#define REXEC_FBC_CLASS_SCAN(CoNd) \
1475REXEC_FBC_SCAN( \
1476 if (CoNd) { \
02d5137b 1477 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1478 goto got_it; \
1479 else \
1480 tmp = doevery; \
1481 } \
1482 else \
1483 tmp = 1; \
1484)
1485
baa60164 1486#define REXEC_FBC_TRYIT \
02d5137b 1487if ((reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1488 goto got_it
1489
e1d1eefb 1490#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
baa60164 1491 if (utf8_target) { \
e1d1eefb
YO
1492 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1493 } \
1494 else { \
1495 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1496 }
e1d1eefb 1497
baa60164 1498#define DUMP_EXEC_POS(li,s,doutf8) \
9d9163fb 1499 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
6d59b646 1500 startpos, doutf8)
786e8c11 1501
cfaf538b 1502
baa60164 1503#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1504 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
cfaf538b
KW
1505 tmp = TEST_NON_UTF8(tmp); \
1506 REXEC_FBC_UTF8_SCAN( \
ce1d15d5 1507 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
cfaf538b
KW
1508 tmp = !tmp; \
1509 IF_SUCCESS; \
1510 } \
1511 else { \
1512 IF_FAIL; \
1513 } \
1514 ); \
1515
baa60164 1516#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1517 if (s == reginfo->strbeg) { \
cfaf538b
KW
1518 tmp = '\n'; \
1519 } \
1520 else { \
9d9163fb 1521 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
3db24e1e
KW
1522 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1523 0, UTF8_ALLOW_DEFAULT); \
cfaf538b
KW
1524 } \
1525 tmp = TeSt1_UtF8; \
3db24e1e 1526 LOAD_UTF8_CHARCLASS_ALNUM(); \
cfaf538b 1527 REXEC_FBC_UTF8_SCAN( \
3db24e1e 1528 if (tmp == ! (TeSt2_UtF8)) { \
cfaf538b
KW
1529 tmp = !tmp; \
1530 IF_SUCCESS; \
1531 } \
1532 else { \
1533 IF_FAIL; \
1534 } \
1535 ); \
1536
63ac0dad
KW
1537/* The only difference between the BOUND and NBOUND cases is that
1538 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1539 * NBOUND. This is accomplished by passing it in either the if or else clause,
1540 * with the other one being empty */
1541#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1542 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1543
1544#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1545 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1546
1547#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1548 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1549
1550#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1551 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1552
63ac0dad
KW
1553
1554/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1555 * be passed in completely with the variable name being tested, which isn't
1556 * such a clean interface, but this is easier to read than it was before. We
1557 * are looking for the boundary (or non-boundary between a word and non-word
1558 * character. The utf8 and non-utf8 cases have the same logic, but the details
1559 * must be different. Find the "wordness" of the character just prior to this
1560 * one, and compare it with the wordness of this one. If they differ, we have
1561 * a boundary. At the beginning of the string, pretend that the previous
1562 * character was a new-line */
baa60164 1563#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1564 if (utf8_target) { \
baa60164 1565 UTF8_CODE \
63ac0dad
KW
1566 } \
1567 else { /* Not utf8 */ \
9d9163fb 1568 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad
KW
1569 tmp = TEST_NON_UTF8(tmp); \
1570 REXEC_FBC_SCAN( \
1571 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1572 tmp = !tmp; \
1573 IF_SUCCESS; \
1574 } \
1575 else { \
1576 IF_FAIL; \
1577 } \
1578 ); \
1579 } \
baa60164 1580 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
63ac0dad
KW
1581 goto got_it;
1582
786e8c11 1583/* We know what class REx starts with. Try to find this position... */
02d5137b 1584/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1585/* annoyingly all the vars in this routine have different names from their counterparts
1586 in regmatch. /grrr */
1587
3c3eec57 1588STATIC char *
07be1b83 1589S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1590 const char *strend, regmatch_info *reginfo)
a687059c 1591{
73104a1b
KW
1592 dVAR;
1593 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1594 char *pat_string; /* The pattern's exactish string */
1595 char *pat_end; /* ptr to end char of pat_string */
1596 re_fold_t folder; /* Function for computing non-utf8 folds */
1597 const U8 *fold_array; /* array for folding ords < 256 */
1598 STRLEN ln;
1599 STRLEN lnc;
73104a1b
KW
1600 U8 c1;
1601 U8 c2;
1602 char *e;
1603 I32 tmp = 1; /* Scratch variable? */
ba44c216 1604 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1605 UV utf8_fold_flags = 0;
f9176b44 1606 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1607 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1608 with a result inverts that result, as 0^1 =
1609 1 and 1^1 = 0 */
1610 _char_class_number classnum;
1611
73104a1b 1612 RXi_GET_DECL(prog,progi);
2f7f8cb1 1613
73104a1b 1614 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1615
73104a1b
KW
1616 /* We know what class it must start with. */
1617 switch (OP(c)) {
1618 case ANYOF:
1619 if (utf8_target) {
1620 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1621 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1622 }
1623 else {
1624 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1625 }
1626 break;
1627 case CANY:
1628 REXEC_FBC_SCAN(
02d5137b 1629 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
73104a1b
KW
1630 goto got_it;
1631 else
1632 tmp = doevery;
1633 );
1634 break;
1635
098b07d5
KW
1636 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1637 assert(! is_utf8_pat);
1638 /* FALL THROUGH */
73104a1b 1639 case EXACTFA:
984e6dd1 1640 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1641 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1642 goto do_exactf_utf8;
1643 }
1644 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1645 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1646 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1647
2fdb7295
KW
1648 case EXACTF: /* This node only generated for non-utf8 patterns */
1649 assert(! is_utf8_pat);
73104a1b 1650 if (utf8_target) {
73104a1b
KW
1651 utf8_fold_flags = 0;
1652 goto do_exactf_utf8;
1653 }
1654 fold_array = PL_fold;
1655 folder = foldEQ;
1656 goto do_exactf_non_utf8;
1657
1658 case EXACTFL:
31f05a37 1659 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1660 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1661 goto do_exactf_utf8;
1662 }
1663 fold_array = PL_fold_locale;
1664 folder = foldEQ_locale;
1665 goto do_exactf_non_utf8;
3c760661 1666
73104a1b 1667 case EXACTFU_SS:
984e6dd1 1668 if (is_utf8_pat) {
73104a1b
KW
1669 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1670 }
1671 goto do_exactf_utf8;
16d951b7 1672
73104a1b 1673 case EXACTFU:
984e6dd1
DM
1674 if (is_utf8_pat || utf8_target) {
1675 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1676 goto do_exactf_utf8;
1677 }
fac1af77 1678
73104a1b
KW
1679 /* Any 'ss' in the pattern should have been replaced by regcomp,
1680 * so we don't have to worry here about this single special case
1681 * in the Latin1 range */
1682 fold_array = PL_fold_latin1;
1683 folder = foldEQ_latin1;
1684
1685 /* FALL THROUGH */
1686
1687 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1688 are no glitches with fold-length differences
1689 between the target string and pattern */
1690
1691 /* The idea in the non-utf8 EXACTF* cases is to first find the
1692 * first character of the EXACTF* node and then, if necessary,
1693 * case-insensitively compare the full text of the node. c1 is the
1694 * first character. c2 is its fold. This logic will not work for
1695 * Unicode semantics and the german sharp ss, which hence should
1696 * not be compiled into a node that gets here. */
1697 pat_string = STRING(c);
1698 ln = STR_LEN(c); /* length to match in octets/bytes */
1699
1700 /* We know that we have to match at least 'ln' bytes (which is the
1701 * same as characters, since not utf8). If we have to match 3
1702 * characters, and there are only 2 availabe, we know without
1703 * trying that it will fail; so don't start a match past the
1704 * required minimum number from the far end */
ea3daa5d 1705 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1706
02d5137b 1707 if (reginfo->intuit && e < s) {
73104a1b
KW
1708 e = s; /* Due to minlen logic of intuit() */
1709 }
fac1af77 1710
73104a1b
KW
1711 c1 = *pat_string;
1712 c2 = fold_array[c1];
1713 if (c1 == c2) { /* If char and fold are the same */
1714 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1715 }
1716 else {
1717 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1718 }
1719 break;
fac1af77 1720
73104a1b
KW
1721 do_exactf_utf8:
1722 {
1723 unsigned expansion;
1724
1725 /* If one of the operands is in utf8, we can't use the simpler folding
1726 * above, due to the fact that many different characters can have the
1727 * same fold, or portion of a fold, or different- length fold */
1728 pat_string = STRING(c);
1729 ln = STR_LEN(c); /* length to match in octets/bytes */
1730 pat_end = pat_string + ln;
984e6dd1 1731 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1732 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1733 : ln;
1734
1735 /* We have 'lnc' characters to match in the pattern, but because of
1736 * multi-character folding, each character in the target can match
1737 * up to 3 characters (Unicode guarantees it will never exceed
1738 * this) if it is utf8-encoded; and up to 2 if not (based on the
1739 * fact that the Latin 1 folds are already determined, and the
1740 * only multi-char fold in that range is the sharp-s folding to
1741 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1742 * string character. Adjust lnc accordingly, rounding up, so that
1743 * if we need to match at least 4+1/3 chars, that really is 5. */
1744 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1745 lnc = (lnc + expansion - 1) / expansion;
1746
1747 /* As in the non-UTF8 case, if we have to match 3 characters, and
1748 * only 2 are left, it's guaranteed to fail, so don't start a
1749 * match that would require us to go beyond the end of the string
1750 */
ea3daa5d 1751 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1752
02d5137b 1753 if (reginfo->intuit && e < s) {
73104a1b
KW
1754 e = s; /* Due to minlen logic of intuit() */
1755 }
0658cdde 1756
73104a1b
KW
1757 /* XXX Note that we could recalculate e to stop the loop earlier,
1758 * as the worst case expansion above will rarely be met, and as we
1759 * go along we would usually find that e moves further to the left.
1760 * This would happen only after we reached the point in the loop
1761 * where if there were no expansion we should fail. Unclear if
1762 * worth the expense */
1763
1764 while (s <= e) {
1765 char *my_strend= (char *)strend;
1766 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1767 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1768 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1769 {
1770 goto got_it;
1771 }
1772 s += (utf8_target) ? UTF8SKIP(s) : 1;
1773 }
1774 break;
1775 }
1776 case BOUNDL:
0eb30aeb 1777 FBC_BOUND(isWORDCHAR_LC,
f4cd282c 1778 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1779 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1780 break;
1781 case NBOUNDL:
0eb30aeb 1782 FBC_NBOUND(isWORDCHAR_LC,
f4cd282c 1783 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1784 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1785 break;
1786 case BOUND:
1787 FBC_BOUND(isWORDCHAR,
0eb30aeb 1788 isWORDCHAR_uni(tmp),
03940dc2 1789 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1790 break;
1791 case BOUNDA:
1792 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1793 isWORDCHAR_A(tmp),
1794 isWORDCHAR_A((U8*)s));
1795 break;
1796 case NBOUND:
1797 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1798 isWORDCHAR_uni(tmp),
03940dc2 1799 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1800 break;
1801 case NBOUNDA:
1802 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1803 isWORDCHAR_A(tmp),
1804 isWORDCHAR_A((U8*)s));
1805 break;
1806 case BOUNDU:
1807 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1808 isWORDCHAR_uni(tmp),
03940dc2 1809 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1810 break;
1811 case NBOUNDU:
1812 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1813 isWORDCHAR_uni(tmp),
03940dc2 1814 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1815 break;
73104a1b
KW
1816 case LNBREAK:
1817 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1818 is_LNBREAK_latin1_safe(s, strend)
1819 );
1820 break;
3018b823
KW
1821
1822 /* The argument to all the POSIX node types is the class number to pass to
1823 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1824
1825 case NPOSIXL:
1826 to_complement = 1;
1827 /* FALLTHROUGH */
1828
1829 case POSIXL:
3018b823
KW
1830 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1831 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1832 break;
3018b823
KW
1833
1834 case NPOSIXD:
1835 to_complement = 1;
1836 /* FALLTHROUGH */
1837
1838 case POSIXD:
1839 if (utf8_target) {
1840 goto posix_utf8;
1841 }
1842 goto posixa;
1843
1844 case NPOSIXA:
1845 if (utf8_target) {
1846 /* The complement of something that matches only ASCII matches all
1847 * UTF-8 variant code points, plus everything in ASCII that isn't
1848 * in the class */
1849 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1850 || ! _generic_isCC_A(*s, FLAGS(c)));
1851 break;
1852 }
1853
1854 to_complement = 1;
1855 /* FALLTHROUGH */
1856
73104a1b 1857 case POSIXA:
3018b823 1858 posixa:
73104a1b 1859 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1860 * byte invariant character. */
1861 REXEC_FBC_CLASS_SCAN(
1862 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1863 break;
3018b823
KW
1864
1865 case NPOSIXU:
1866 to_complement = 1;
1867 /* FALLTHROUGH */
1868
1869 case POSIXU:
1870 if (! utf8_target) {
1871 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1872 FLAGS(c))));
1873 }
1874 else {
1875
1876 posix_utf8:
1877 classnum = (_char_class_number) FLAGS(c);
1878 if (classnum < _FIRST_NON_SWASH_CC) {
1879 while (s < strend) {
1880
1881 /* We avoid loading in the swash as long as possible, but
1882 * should we have to, we jump to a separate loop. This
1883 * extra 'if' statement is what keeps this code from being
1884 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1885 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1886 goto found_above_latin1;
1887 }
1888 if ((UTF8_IS_INVARIANT(*s)
1889 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1890 classnum)))
1891 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1892 && to_complement ^ cBOOL(
94bb8c36
KW
1893 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1894 *(s + 1)),
3018b823
KW
1895 classnum))))
1896 {
02d5137b 1897 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
1898 goto got_it;
1899 else {
1900 tmp = doevery;
1901 }
1902 }
1903 else {
1904 tmp = 1;
1905 }
1906 s += UTF8SKIP(s);
1907 }
1908 }
1909 else switch (classnum) { /* These classes are implemented as
1910 macros */
1911 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1912 revert the change of \v matching this */
1913 /* FALL THROUGH */
1914
1915 case _CC_ENUM_PSXSPC:
1916 REXEC_FBC_UTF8_CLASS_SCAN(
1917 to_complement ^ cBOOL(isSPACE_utf8(s)));
1918 break;
1919
1920 case _CC_ENUM_BLANK:
1921 REXEC_FBC_UTF8_CLASS_SCAN(
1922 to_complement ^ cBOOL(isBLANK_utf8(s)));
1923 break;
1924
1925 case _CC_ENUM_XDIGIT:
1926 REXEC_FBC_UTF8_CLASS_SCAN(
1927 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1928 break;
1929
1930 case _CC_ENUM_VERTSPACE:
1931 REXEC_FBC_UTF8_CLASS_SCAN(
1932 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1933 break;
1934
1935 case _CC_ENUM_CNTRL:
1936 REXEC_FBC_UTF8_CLASS_SCAN(
1937 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1938 break;
1939
1940 default:
1941 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1942 assert(0); /* NOTREACHED */
1943 }
1944 }
1945 break;
1946
1947 found_above_latin1: /* Here we have to load a swash to get the result
1948 for the current code point */
1949 if (! PL_utf8_swash_ptrs[classnum]) {
1950 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1951 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
1952 _core_swash_init("utf8",
1953 "",
1954 &PL_sv_undef, 1, 0,
1955 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
1956 }
1957
1958 /* This is a copy of the loop above for swash classes, though using the
1959 * FBC macro instead of being expanded out. Since we've loaded the
1960 * swash, we don't have to check for that each time through the loop */
1961 REXEC_FBC_UTF8_CLASS_SCAN(
1962 to_complement ^ cBOOL(_generic_utf8(
1963 classnum,
1964 s,
1965 swash_fetch(PL_utf8_swash_ptrs[classnum],
1966 (U8 *) s, TRUE))));
73104a1b
KW
1967 break;
1968
1969 case AHOCORASICKC:
1970 case AHOCORASICK:
1971 {
1972 DECL_TRIE_TYPE(c);
1973 /* what trie are we using right now */
1974 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1975 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1976 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1977
1978 const char *last_start = strend - trie->minlen;
6148ee25 1979#ifdef DEBUGGING
73104a1b 1980 const char *real_start = s;
6148ee25 1981#endif
73104a1b
KW
1982 STRLEN maxlen = trie->maxlen;
1983 SV *sv_points;
1984 U8 **points; /* map of where we were in the input string
1985 when reading a given char. For ASCII this
1986 is unnecessary overhead as the relationship
1987 is always 1:1, but for Unicode, especially
1988 case folded Unicode this is not true. */
1989 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1990 U8 *bitmap=NULL;
1991
1992
1993 GET_RE_DEBUG_FLAGS_DECL;
1994
1995 /* We can't just allocate points here. We need to wrap it in
1996 * an SV so it gets freed properly if there is a croak while
1997 * running the match */
1998 ENTER;
1999 SAVETMPS;
2000 sv_points=newSV(maxlen * sizeof(U8 *));
2001 SvCUR_set(sv_points,
2002 maxlen * sizeof(U8 *));
2003 SvPOK_on(sv_points);
2004 sv_2mortal(sv_points);
2005 points=(U8**)SvPV_nolen(sv_points );
2006 if ( trie_type != trie_utf8_fold
2007 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2008 {
2009 if (trie->bitmap)
2010 bitmap=(U8*)trie->bitmap;
2011 else
2012 bitmap=(U8*)ANYOF_BITMAP(c);
2013 }
2014 /* this is the Aho-Corasick algorithm modified a touch
2015 to include special handling for long "unknown char" sequences.
2016 The basic idea being that we use AC as long as we are dealing
2017 with a possible matching char, when we encounter an unknown char
2018 (and we have not encountered an accepting state) we scan forward
2019 until we find a legal starting char.
2020 AC matching is basically that of trie matching, except that when
2021 we encounter a failing transition, we fall back to the current
2022 states "fail state", and try the current char again, a process
2023 we repeat until we reach the root state, state 1, or a legal
2024 transition. If we fail on the root state then we can either
2025 terminate if we have reached an accepting state previously, or
2026 restart the entire process from the beginning if we have not.
2027
2028 */
2029 while (s <= last_start) {
2030 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2031 U8 *uc = (U8*)s;
2032 U16 charid = 0;
2033 U32 base = 1;
2034 U32 state = 1;
2035 UV uvc = 0;
2036 STRLEN len = 0;
2037 STRLEN foldlen = 0;
2038 U8 *uscan = (U8*)NULL;
2039 U8 *leftmost = NULL;
2040#ifdef DEBUGGING
2041 U32 accepted_word= 0;
786e8c11 2042#endif
73104a1b
KW
2043 U32 pointpos = 0;
2044
2045 while ( state && uc <= (U8*)strend ) {
2046 int failed=0;
2047 U32 word = aho->states[ state ].wordnum;
2048
2049 if( state==1 ) {
2050 if ( bitmap ) {
2051 DEBUG_TRIE_EXECUTE_r(
2052 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2053 dump_exec_pos( (char *)uc, c, strend, real_start,
2054 (char *)uc, utf8_target );
2055 PerlIO_printf( Perl_debug_log,
2056 " Scanning for legal start char...\n");
2057 }
2058 );
2059 if (utf8_target) {
2060 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2061 uc += UTF8SKIP(uc);
2062 }
2063 } else {
2064 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2065 uc++;
2066 }
786e8c11 2067 }
73104a1b 2068 s= (char *)uc;
07be1b83 2069 }
73104a1b
KW
2070 if (uc >(U8*)last_start) break;
2071 }
2072
2073 if ( word ) {
2074 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2075 if (!leftmost || lpos < leftmost) {
2076 DEBUG_r(accepted_word=word);
2077 leftmost= lpos;
7016d6eb 2078 }
73104a1b 2079 if (base==0) break;
7016d6eb 2080
73104a1b
KW
2081 }
2082 points[pointpos++ % maxlen]= uc;
2083 if (foldlen || uc < (U8*)strend) {
2084 REXEC_TRIE_READ_CHAR(trie_type, trie,
2085 widecharmap, uc,
2086 uscan, len, uvc, charid, foldlen,
2087 foldbuf, uniflags);
2088 DEBUG_TRIE_EXECUTE_r({
2089 dump_exec_pos( (char *)uc, c, strend,
2090 real_start, s, utf8_target);
2091 PerlIO_printf(Perl_debug_log,
2092 " Charid:%3u CP:%4"UVxf" ",
2093 charid, uvc);
2094 });
2095 }
2096 else {
2097 len = 0;
2098 charid = 0;
2099 }
07be1b83 2100
73104a1b
KW
2101
2102 do {
6148ee25 2103#ifdef DEBUGGING
73104a1b 2104 word = aho->states[ state ].wordnum;
6148ee25 2105#endif
73104a1b
KW
2106 base = aho->states[ state ].trans.base;
2107
2108 DEBUG_TRIE_EXECUTE_r({
2109 if (failed)
2110 dump_exec_pos( (char *)uc, c, strend, real_start,
2111 s, utf8_target );
2112 PerlIO_printf( Perl_debug_log,
2113 "%sState: %4"UVxf", word=%"UVxf,
2114 failed ? " Fail transition to " : "",
2115 (UV)state, (UV)word);
2116 });
2117 if ( base ) {
2118 U32 tmp;
2119 I32 offset;
2120 if (charid &&
2121 ( ((offset = base + charid
2122 - 1 - trie->uniquecharcount)) >= 0)
2123 && ((U32)offset < trie->lasttrans)
2124 && trie->trans[offset].check == state
2125 && (tmp=trie->trans[offset].next))
2126 {
2127 DEBUG_TRIE_EXECUTE_r(
2128 PerlIO_printf( Perl_debug_log," - legal\n"));
2129 state = tmp;
2130 break;
07be1b83
YO
2131 }
2132 else {
786e8c11 2133 DEBUG_TRIE_EXECUTE_r(
73104a1b 2134 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2135 failed = 1;
73104a1b 2136 state = aho->fail[state];
07be1b83 2137 }
07be1b83 2138 }
73104a1b
KW
2139 else {
2140 /* we must be accepting here */
2141 DEBUG_TRIE_EXECUTE_r(
2142 PerlIO_printf( Perl_debug_log," - accepting\n"));
2143 failed = 1;
2144 break;
786e8c11 2145 }
73104a1b
KW
2146 } while(state);
2147 uc += len;
2148 if (failed) {
2149 if (leftmost)
2150 break;
2151 if (!state) state = 1;
07be1b83 2152 }
73104a1b
KW
2153 }
2154 if ( aho->states[ state ].wordnum ) {
2155 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2156 if (!leftmost || lpos < leftmost) {
2157 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2158 leftmost = lpos;
07be1b83
YO
2159 }
2160 }
73104a1b
KW
2161 if (leftmost) {
2162 s = (char*)leftmost;
2163 DEBUG_TRIE_EXECUTE_r({
2164 PerlIO_printf(
2165 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2166 (UV)accepted_word, (IV)(s - real_start)
2167 );
2168 });
02d5137b 2169 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2170 FREETMPS;
2171 LEAVE;
2172 goto got_it;
2173 }
2174 s = HOPc(s,1);
2175 DEBUG_TRIE_EXECUTE_r({
2176 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2177 });
2178 } else {
2179 DEBUG_TRIE_EXECUTE_r(
2180 PerlIO_printf( Perl_debug_log,"No match.\n"));
2181 break;
2182 }
2183 }
2184 FREETMPS;
2185 LEAVE;
2186 }
2187 break;
2188 default:
2189 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2190 break;
2191 }
2192 return 0;
2193 got_it:
2194 return s;
6eb5f6b9
JH
2195}
2196
60165aa4
DM
2197/* set RX_SAVED_COPY, RX_SUBBEG etc.
2198 * flags have same meanings as with regexec_flags() */
2199
749f4950
DM
2200static void
2201S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2202 char *strbeg,
2203 char *strend,
2204 SV *sv,
2205 U32 flags,
2206 bool utf8_target)
2207{
2208 struct regexp *const prog = ReANY(rx);
2209
60165aa4
DM
2210 if (flags & REXEC_COPY_STR) {
2211#ifdef PERL_ANY_COW
2212 if (SvCANCOW(sv)) {
2213 if (DEBUG_C_TEST) {
2214 PerlIO_printf(Perl_debug_log,
2215 "Copy on write: regexp capture, type %d\n",
2216 (int) SvTYPE(sv));
2217 }
5411a0e5
DM
2218 /* Create a new COW SV to share the match string and store
2219 * in saved_copy, unless the current COW SV in saved_copy
2220 * is valid and suitable for our purpose */
2221 if (( prog->saved_copy
2222 && SvIsCOW(prog->saved_copy)
2223 && SvPOKp(prog->saved_copy)
2224 && SvIsCOW(sv)
2225 && SvPOKp(sv)
2226 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2227 {
5411a0e5
DM
2228 /* just reuse saved_copy SV */
2229 if (RXp_MATCH_COPIED(prog)) {
2230 Safefree(prog->subbeg);
2231 RXp_MATCH_COPIED_off(prog);
2232 }
2233 }
2234 else {
2235 /* create new COW SV to share string */
a76b0e90
DM
2236 RX_MATCH_COPY_FREE(rx);
2237 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2238 }
5411a0e5
DM
2239 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2240 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2241 prog->sublen = strend - strbeg;
2242 prog->suboffset = 0;
2243 prog->subcoffset = 0;
2244 } else
2245#endif
2246 {
99a90e59
FC
2247 SSize_t min = 0;
2248 SSize_t max = strend - strbeg;
ea3daa5d 2249 SSize_t sublen;
60165aa4
DM
2250
2251 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2252 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2253 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2254 ) { /* don't copy $' part of string */
2255 U32 n = 0;
2256 max = -1;
2257 /* calculate the right-most part of the string covered
2258 * by a capture. Due to look-ahead, this may be to
2259 * the right of $&, so we have to scan all captures */
2260 while (n <= prog->lastparen) {
2261 if (prog->offs[n].end > max)
2262 max = prog->offs[n].end;
2263 n++;
2264 }
2265 if (max == -1)
2266 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2267 ? prog->offs[0].start
2268 : 0;
2269 assert(max >= 0 && max <= strend - strbeg);
2270 }
2271
2272 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2273 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2274 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2275 ) { /* don't copy $` part of string */
2276 U32 n = 0;
2277 min = max;
2278 /* calculate the left-most part of the string covered
2279 * by a capture. Due to look-behind, this may be to
2280 * the left of $&, so we have to scan all captures */
2281 while (min && n <= prog->lastparen) {
2282 if ( prog->offs[n].start != -1
2283 && prog->offs[n].start < min)
2284 {
2285 min = prog->offs[n].start;
2286 }
2287 n++;
2288 }
2289 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2290 && min > prog->offs[0].end
2291 )
2292 min = prog->offs[0].end;
2293
2294 }
2295
2296 assert(min >= 0 && min <= max && min <= strend - strbeg);
2297 sublen = max - min;
2298
2299 if (RX_MATCH_COPIED(rx)) {
2300 if (sublen > prog->sublen)
2301 prog->subbeg =
2302 (char*)saferealloc(prog->subbeg, sublen+1);
2303 }
2304 else
2305 prog->subbeg = (char*)safemalloc(sublen+1);
2306 Copy(strbeg + min, prog->subbeg, sublen, char);
2307 prog->subbeg[sublen] = '\0';
2308 prog->suboffset = min;
2309 prog->sublen = sublen;
2310 RX_MATCH_COPIED_on(rx);
2311 }
2312 prog->subcoffset = prog->suboffset;
2313 if (prog->suboffset && utf8_target) {
2314 /* Convert byte offset to chars.
2315 * XXX ideally should only compute this if @-/@+
2316 * has been seen, a la PL_sawampersand ??? */
2317
2318 /* If there's a direct correspondence between the
2319 * string which we're matching and the original SV,
2320 * then we can use the utf8 len cache associated with
2321 * the SV. In particular, it means that under //g,
2322 * sv_pos_b2u() will use the previously cached
2323 * position to speed up working out the new length of
2324 * subcoffset, rather than counting from the start of
2325 * the string each time. This stops
2326 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2327 * from going quadratic */
2328 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2329 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2330 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2331 else
2332 prog->subcoffset = utf8_length((U8*)strbeg,
2333 (U8*)(strbeg+prog->suboffset));
2334 }
2335 }
2336 else {
2337 RX_MATCH_COPY_FREE(rx);
2338 prog->subbeg = strbeg;
2339 prog->suboffset = 0;
2340 prog->subcoffset = 0;
2341 prog->sublen = strend - strbeg;
2342 }
2343}
2344
2345
2346
fae667d5 2347
6eb5f6b9
JH
2348/*
2349 - regexec_flags - match a regexp against a string
2350 */
2351I32
5aaab254 2352Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2353 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2354/* stringarg: the point in the string at which to begin matching */
2355/* strend: pointer to null at end of string */
2356/* strbeg: real beginning of string */
2357/* minend: end of match must be >= minend bytes after stringarg. */
2358/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2359 * itself is accessed via the pointers above */
2360/* data: May be used for some additional optimizations.
d058ec57 2361 Currently unused. */
a340edde 2362/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2363
6eb5f6b9 2364{
97aff369 2365 dVAR;
8d919b0a 2366 struct regexp *const prog = ReANY(rx);
5aaab254 2367 char *s;
eb578fdb 2368 regnode *c;
03c83e26 2369 char *startpos;
ea3daa5d
FC
2370 SSize_t minlen; /* must match at least this many chars */
2371 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2372 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2373 I32 multiline;
f8fc2ecf 2374 RXi_GET_DECL(prog,progi);
02d5137b
DM
2375 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2376 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2377 regexp_paren_pair *swap = NULL;
006f26b2 2378 I32 oldsave;
a3621e74
YO
2379 GET_RE_DEBUG_FLAGS_DECL;
2380
7918f24d 2381 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2382 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2383
2384 /* Be paranoid... */
03c83e26 2385 if (prog == NULL || stringarg == NULL) {
6eb5f6b9
JH
2386 Perl_croak(aTHX_ "NULL regexp parameter");
2387 return 0;
2388 }
2389
6c3fea77 2390 DEBUG_EXECUTE_r(
03c83e26 2391 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2392 "Matching");
2393 );
8adc0f72 2394
b342a604
DM
2395 startpos = stringarg;
2396
58430ea8 2397 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2398 MAGIC *mg;
2399
fef7148b
DM
2400 /* set reginfo->ganch, the position where \G can match */
2401
2402 reginfo->ganch =
2403 (flags & REXEC_IGNOREPOS)
2404 ? stringarg /* use start pos rather than pos() */
2405 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2406 /* Defined pos(): */
2407 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2408 : strbeg; /* pos() not defined; use start of string */
2409
2410 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2411 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2412
03c83e26
DM
2413 /* in the presence of \G, we may need to start looking earlier in
2414 * the string than the suggested start point of stringarg:
0b2c2a84 2415 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2416 * offset, such as
2417 * /..\G/: gofs = 2
2418 * /ab|c\G/: gofs = 1
2419 * or if the minimum offset isn't known, then we have to go back
2420 * to the start of the string, e.g. /w+\G/
2421 */
2bfbe302 2422
8e1490ee 2423 if (prog->intflags & PREGf_ANCH_GPOS) {
2bfbe302
DM
2424 startpos = reginfo->ganch - prog->gofs;
2425 if (startpos <
2426 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2427 {
2428 DEBUG_r(PerlIO_printf(Perl_debug_log,
2429 "fail: ganch-gofs before earliest possible start\n"));
2430 return 0;
2431 }
2432 }
2433 else if (prog->gofs) {
b342a604
DM
2434 if (startpos - prog->gofs < strbeg)
2435 startpos = strbeg;
2436 else
2437 startpos -= prog->gofs;
03c83e26 2438 }
58430ea8 2439 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2440 startpos = strbeg;
03c83e26
DM
2441 }
2442
2443 minlen = prog->minlen;
b342a604 2444 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2445 DEBUG_r(PerlIO_printf(Perl_debug_log,
2446 "Regex match can't succeed, so not even tried\n"));
2447 return 0;
2448 }
2449
63a3746a
DM
2450 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2451 * which will call destuctors to reset PL_regmatch_state, free higher
2452 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2453 * regmatch_info_aux_eval */
2454
2455 oldsave = PL_savestack_ix;
2456
dfa77d06
DM
2457 s = startpos;
2458
e322109a 2459 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2460 && !(flags & REXEC_CHECKED))
2461 {
dfa77d06 2462 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2463 flags, NULL);
dfa77d06 2464 if (!s)
7fadf4a7
DM
2465 return 0;
2466
e322109a 2467 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2468 /* we can match based purely on the result of INTUIT.
2469 * Set up captures etc just for $& and $-[0]
2470 * (an intuit-only match wont have $1,$2,..) */
2471 assert(!prog->nparens);
d5e7783a
DM
2472
2473 /* s/// doesn't like it if $& is earlier than where we asked it to
2474 * start searching (which can happen on something like /.\G/) */
2475 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2476 && (s < stringarg))
2477 {
2478 /* this should only be possible under \G */
58430ea8 2479 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2480 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2481 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2482 goto phooey;
2483 }
2484
7fadf4a7
DM
2485 /* match via INTUIT shouldn't have any captures.
2486 * Let @-, @+, $^N know */
2487 prog->lastparen = prog->lastcloseparen = 0;
2488 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2489 prog->offs[0].start = s - strbeg;
2490 prog->offs[0].end = utf8_target
2491 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2492 : s - strbeg + prog->minlenret;
7fadf4a7 2493 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2494 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2495 strbeg, strend,
2496 sv, flags, utf8_target);
2497
7fadf4a7
DM
2498 return 1;
2499 }
2500 }
2501
6c3fea77 2502 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2503
dfa77d06 2504 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2505 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2506 "String too short [regexec_flags]...\n"));
2507 goto phooey;
1aa99e6b 2508 }
1de06328 2509
6eb5f6b9 2510 /* Check validity of program. */
f8fc2ecf 2511 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2512 Perl_croak(aTHX_ "corrupted regexp program");
2513 }
2514
6c3fea77
DM
2515 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2516 reginfo->intuit = 0;
2517 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2518 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2519 reginfo->warned = FALSE;
9d9163fb 2520 reginfo->strbeg = strbeg;
02d5137b 2521 reginfo->sv = sv;
1cb48e53 2522 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2523 reginfo->strend = strend;
6eb5f6b9 2524 /* see how far we have to get to not match where we matched before */
fe3974be 2525 reginfo->till = stringarg + minend;
6eb5f6b9 2526
60779a30 2527 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
2528 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2529 S_cleanup_regmatch_info_aux has executed (registered by
2530 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2531 magic belonging to this SV.
2532 Not newSVsv, either, as it does not COW.
2533 */
60779a30 2534 assert(!IS_PADGV(sv));
82c23608 2535 reginfo->sv = newSV(0);
4cba5ac0 2536 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2537 SAVEFREESV(reginfo->sv);
2538 }
2539
331b2dcc
DM
2540 /* reserve next 2 or 3 slots in PL_regmatch_state:
2541 * slot N+0: may currently be in use: skip it
2542 * slot N+1: use for regmatch_info_aux struct
2543 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2544 * slot N+3: ready for use by regmatch()
2545 */
bf2039a9 2546
331b2dcc
DM
2547 {
2548 regmatch_state *old_regmatch_state;
2549 regmatch_slab *old_regmatch_slab;
2550 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2551
2552 /* on first ever match, allocate first slab */
2553 if (!PL_regmatch_slab) {
2554 Newx(PL_regmatch_slab, 1, regmatch_slab);
2555 PL_regmatch_slab->prev = NULL;
2556 PL_regmatch_slab->next = NULL;
2557 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2558 }
bf2039a9 2559
331b2dcc
DM
2560 old_regmatch_state = PL_regmatch_state;
2561 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2562
331b2dcc
DM
2563 for (i=0; i <= max; i++) {
2564 if (i == 1)
2565 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2566 else if (i ==2)
2567 reginfo->info_aux_eval =
2568 reginfo->info_aux->info_aux_eval =
2569 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 2570
331b2dcc
DM
2571 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2572 PL_regmatch_state = S_push_slab(aTHX);
2573 }
bf2039a9 2574
331b2dcc
DM
2575 /* note initial PL_regmatch_state position; at end of match we'll
2576 * pop back to there and free any higher slabs */
bf2039a9 2577
331b2dcc
DM
2578 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2579 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 2580 reginfo->info_aux->poscache = NULL;
bf2039a9 2581
331b2dcc 2582 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 2583
331b2dcc
DM
2584 if ((prog->extflags & RXf_EVAL_SEEN))
2585 S_setup_eval_state(aTHX_ reginfo);
2586 else
2587 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 2588 }
d3aa529c 2589
6eb5f6b9 2590 /* If there is a "must appear" string, look for it. */
6eb5f6b9 2591
288b8c02 2592 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2593 /* We have to be careful. If the previous successful match
2594 was from this regex we don't want a subsequent partially
2595 successful match to clobber the old results.
2596 So when we detect this possibility we add a swap buffer
d8da0584
KW
2597 to the re, and switch the buffer each match. If we fail,
2598 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2599 */
2600 swap = prog->offs;
2601 /* do we need a save destructor here for eval dies? */
2602 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2603 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2604 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2605 PTR2UV(prog),
2606 PTR2UV(swap),
2607 PTR2UV(prog->offs)
2608 ));
c74340f9 2609 }
6eb5f6b9
JH
2610
2611 /* Simplest case: anchored match need be tried only once. */
2612 /* [unless only anchor is BOL and multiline is set] */
8e1490ee 2613 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3542935d 2614 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9 2615 goto got_it;
8e1490ee 2616 else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
6eb5f6b9
JH
2617 {
2618 char *end;
2619
2620 if (minlen)
2621 dontbother = minlen - 1;
1aa99e6b 2622 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2623 /* for multiline we only have to try after newlines */
33b8afdf 2624 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2625 /* because of the goto we can not easily reuse the macros for bifurcating the
2626 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2627 if (utf8_target) {
2628 if (s == startpos)
2629 goto after_try_utf8;
2630 while (1) {
02d5137b 2631 if (regtry(reginfo, &s)) {
92f3d482
YO
2632 goto got_it;
2633 }
2634 after_try_utf8:
2635 if (s > end) {
2636 goto phooey;
2637 }
2638 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2639 s = re_intuit_start(rx, sv, strbeg,
2640 s + UTF8SKIP(s), strend, flags, NULL);
92f3d482
YO
2641 if (!s) {
2642 goto phooey;
2643 }
2644 }
2645 else {
2646 s += UTF8SKIP(s);
2647 }
2648 }
2649 } /* end search for check string in unicode */
2650 else {
2651 if (s == startpos) {
2652 goto after_try_latin;
2653 }
2654 while (1) {
02d5137b 2655 if (regtry(reginfo, &s)) {
92f3d482
YO
2656 goto got_it;
2657 }
2658 after_try_latin:
2659 if (s > end) {
2660 goto phooey;
2661 }
2662 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2663 s = re_intuit_start(rx, sv, strbeg,
2664 s + 1, strend, flags, NULL);
92f3d482
YO
2665 if (!s) {
2666 goto phooey;
2667 }
2668 }
2669 else {
2670 s++;
2671 }
2672 }
2673 } /* end search for check string in latin*/
2674 } /* end search for check string */
2675 else { /* search for newline */
2676 if (s > startpos) {
2677 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2678 s--;
92f3d482 2679 }
21eede78
YO
2680 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2681 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2682 if (*s++ == '\n') { /* don't need PL_utf8skip here */
02d5137b 2683 if (regtry(reginfo, &s))
6eb5f6b9
JH
2684 goto got_it;
2685 }
92f3d482
YO
2686 }
2687 } /* end search for newline */
2688 } /* end anchored/multiline check string search */
6eb5f6b9 2689 goto phooey;
a8430a8b 2690 } else if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 2691 {
a8430a8b
YO
2692 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2693 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
2694 /* For anchored \G, the only position it can match from is
2695 * (ganch-gofs); we already set startpos to this above; if intuit
2696 * moved us on from there, we can't possibly succeed */
2697 assert(startpos == reginfo->ganch - prog->gofs);
2698 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
2699 goto got_it;
2700 goto phooey;
2701 }
2702
2703 /* Messy cases: unanchored match. */
bbe252da 2704 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2705 /* we have /x+whatever/ */
984e6dd1 2706 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2707 char ch;
bf93d4cc
GS
2708#ifdef DEBUGGING
2709 int did_match = 0;
2710#endif
f2ed9b32 2711 if (utf8_target) {
7e0d5ad7
KW
2712 if (! prog->anchored_utf8) {
2713 to_utf8_substr(prog);
2714 }
2715 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2716 REXEC_FBC_SCAN(
6eb5f6b9 2717 if (*s == ch) {
a3621e74 2718 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2719 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2720 s += UTF8SKIP(s);
2721 while (s < strend && *s == ch)
2722 s += UTF8SKIP(s);
2723 }
4cadc6a9 2724 );
7e0d5ad7 2725
6eb5f6b9
JH
2726 }
2727 else {
7e0d5ad7
KW
2728 if (! prog->anchored_substr) {
2729 if (! to_byte_substr(prog)) {
6b54ddc5 2730 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2731 }
2732 }
2733 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2734 REXEC_FBC_SCAN(
6eb5f6b9 2735 if (*s == ch) {
a3621e74 2736 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2737 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2738 s++;
2739 while (s < strend && *s == ch)
2740 s++;
2741 }
4cadc6a9 2742 );
6eb5f6b9 2743 }
a3621e74 2744 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2745 PerlIO_printf(Perl_debug_log,
b7953727
JH
2746 "Did not find anchored character...\n")
2747 );
6eb5f6b9 2748 }
a0714e2c
SS
2749 else if (prog->anchored_substr != NULL
2750 || prog->anchored_utf8 != NULL
2751 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2752 && prog->float_max_offset < strend - s)) {
2753 SV *must;
ea3daa5d
FC
2754 SSize_t back_max;
2755 SSize_t back_min;
33b8afdf 2756 char *last;
6eb5f6b9 2757 char *last1; /* Last position checked before */
bf93d4cc
GS
2758#ifdef DEBUGGING
2759 int did_match = 0;
2760#endif
33b8afdf 2761 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2762 if (utf8_target) {
2763 if (! prog->anchored_utf8) {
2764 to_utf8_substr(prog);
2765 }
2766 must = prog->anchored_utf8;
2767 }
2768 else {
2769 if (! prog->anchored_substr) {
2770 if (! to_byte_substr(prog)) {
6b54ddc5 2771 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2772 }
2773 }
2774 must = prog->anchored_substr;
2775 }
33b8afdf
JH
2776 back_max = back_min = prog->anchored_offset;
2777 } else {
7e0d5ad7
KW
2778 if (utf8_target) {
2779 if (! prog->float_utf8) {
2780 to_utf8_substr(prog);
2781 }
2782 must = prog->float_utf8;
2783 }
2784 else {
2785 if (! prog->float_substr) {
2786 if (! to_byte_substr(prog)) {
6b54ddc5 2787 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2788 }
2789 }
2790 must = prog->float_substr;
2791 }
33b8afdf
JH
2792 back_max = prog->float_max_offset;
2793 back_min = prog->float_min_offset;
2794 }
1de06328 2795
1de06328
YO
2796 if (back_min<0) {
2797 last = strend;
2798 } else {
2799 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 2800 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
2801 - (SvTAIL(must) != 0) + back_min), strbeg);
2802 }
9d9163fb 2803 if (s > reginfo->strbeg)
6eb5f6b9
JH
2804 last1 = HOPc(s, -1);
2805 else
2806 last1 = s - 1; /* bogus */
2807
a0288114 2808 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 2809 check_substr==must. */
bf05793b 2810 dontbother = 0;
6eb5f6b9
JH
2811 strend = HOPc(strend, -dontbother);
2812 while ( (s <= last) &&
e50d57d4 2813 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 2814 (unsigned char*)strend, must,
c33e64f0 2815 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2816 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2817 if (HOPc(s, -back_max) > last1) {
2818 last1 = HOPc(s, -back_min);
2819 s = HOPc(s, -back_max);
2820 }
2821 else {
9d9163fb
DM
2822 char * const t = (last1 >= reginfo->strbeg)
2823 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2824
2825 last1 = HOPc(s, -back_min);
52657f30 2826 s = t;
6eb5f6b9 2827 }
f2ed9b32 2828 if (utf8_target) {
6eb5f6b9 2829 while (s <= last1) {
02d5137b 2830 if (regtry(reginfo, &s))
6eb5f6b9 2831 goto got_it;
7016d6eb
DM
2832 if (s >= last1) {
2833 s++; /* to break out of outer loop */
2834 break;
2835 }
2836 s += UTF8SKIP(s);
6eb5f6b9
JH
2837 }
2838 }
2839 else {
2840 while (s <= last1) {
02d5137b 2841 if (regtry(reginfo, &s))
6eb5f6b9
JH
2842 goto got_it;
2843 s++;
2844 }
2845 }
2846 }
ab3bbdeb 2847 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2848 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2849 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2850 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2851 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2852 ? "anchored" : "floating"),
ab3bbdeb
YO
2853 quoted, RE_SV_TAIL(must));
2854 });
6eb5f6b9
JH
2855 goto phooey;
2856 }
f8fc2ecf 2857 else if ( (c = progi->regstclass) ) {
f14c76ed 2858 if (minlen) {
f8fc2ecf 2859 const OPCODE op = OP(progi->regstclass);
66e933ab 2860 /* don't bother with what can't match */
786e8c11 2861 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2862 strend = HOPc(strend, -(minlen - 1));
2863 }
a3621e74 2864 DEBUG_EXECUTE_r({
be8e71aa 2865 SV * const prop = sv_newmortal();
2395827c 2866 regprop(prog, prop, c, reginfo);
0df25f3d 2867 {
f2ed9b32 2868 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2869 s,strend-s,60);
0df25f3d 2870 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2871 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2872 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2873 quoted, (int)(strend - s));
0df25f3d 2874 }
ffc61ed2 2875 });
f9176b44 2876 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 2877 goto got_it;
07be1b83 2878 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2879 }
2880 else {
2881 dontbother = 0;
a0714e2c 2882 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2883 /* Trim the end. */
6af40bd7 2884 char *last= NULL;
33b8afdf 2885 SV* float_real;
c33e64f0
FC
2886 STRLEN len;
2887 const char *little;
33b8afdf 2888
7e0d5ad7
KW
2889 if (utf8_target) {
2890 if (! prog->float_utf8) {
2891 to_utf8_substr(prog);
2892 }
2893 float_real = prog->float_utf8;
2894 }
2895 else {
2896 if (! prog->float_substr) {
2897 if (! to_byte_substr(prog)) {
6b54ddc5 2898 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2899 }
2900 }
2901 float_real = prog->float_substr;
2902 }
d6a28714 2903
c33e64f0
FC
2904 little = SvPV_const(float_real, len);
2905 if (SvTAIL(float_real)) {
7f18ad16
KW
2906 /* This means that float_real contains an artificial \n on
2907 * the end due to the presence of something like this:
2908 * /foo$/ where we can match both "foo" and "foo\n" at the
2909 * end of the string. So we have to compare the end of the
2910 * string first against the float_real without the \n and
2911 * then against the full float_real with the string. We
2912 * have to watch out for cases where the string might be
2913 * smaller than the float_real or the float_real without
2914 * the \n. */
1a13b075
YO
2915 char *checkpos= strend - len;
2916 DEBUG_OPTIMISE_r(
2917 PerlIO_printf(Perl_debug_log,
2918 "%sChecking for float_real.%s\n",
2919 PL_colors[4], PL_colors[5]));
2920 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2921 /* can't match, even if we remove the trailing \n
2922 * string is too short to match */
1a13b075
YO
2923 DEBUG_EXECUTE_r(
2924 PerlIO_printf(Perl_debug_log,
2925 "%sString shorter than required trailing substring, cannot match.%s\n",
2926 PL_colors[4], PL_colors[5]));
2927 goto phooey;
2928 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2929 /* can match, the end of the string matches without the
2930 * "\n" */
1a13b075
YO
2931 last = checkpos + 1;
2932 } else if (checkpos < strbeg) {
7f18ad16
KW
2933 /* cant match, string is too short when the "\n" is
2934 * included */
1a13b075
YO
2935 DEBUG_EXECUTE_r(
2936 PerlIO_printf(Perl_debug_log,
2937 "%sString does not contain required trailing substring, cannot match.%s\n",
2938 PL_colors[4], PL_colors[5]));
2939 goto phooey;
2940 } else if (!multiline) {
7f18ad16
KW
2941 /* non multiline match, so compare with the "\n" at the
2942 * end of the string */
1a13b075
YO
2943 if (memEQ(checkpos, little, len)) {
2944 last= checkpos;
2945 } else {
2946 DEBUG_EXECUTE_r(
2947 PerlIO_printf(Perl_debug_log,
2948 "%sString does not contain required trailing substring, cannot match.%s\n",
2949 PL_colors[4], PL_colors[5]));
2950 goto phooey;
2951 }
2952 } else {
7f18ad16
KW
2953 /* multiline match, so we have to search for a place
2954 * where the full string is located */
d6a28714 2955 goto find_last;
1a13b075 2956 }
c33e64f0 2957 } else {
d6a28714 2958 find_last:
9041c2e3 2959 if (len)
d6a28714 2960 last = rninstr(s, strend, little, little + len);
b8c5462f 2961 else
a0288114 2962 last = strend; /* matching "$" */
b8c5462f 2963 }
6af40bd7 2964 if (!last) {
7f18ad16
KW
2965 /* at one point this block contained a comment which was
2966 * probably incorrect, which said that this was a "should not
2967 * happen" case. Even if it was true when it was written I am
2968 * pretty sure it is not anymore, so I have removed the comment
2969 * and replaced it with this one. Yves */
6bda09f9
YO
2970 DEBUG_EXECUTE_r(
2971 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2972 "String does not contain required substring, cannot match.\n"
2973 ));
2974 goto phooey;
bf93d4cc 2975 }
d6a28714
JH
2976 dontbother = strend - last + prog->float_min_offset;
2977 }
2978 if (minlen && (dontbother < minlen))
2979 dontbother = minlen - 1;
2980 strend -= dontbother; /* this one's always in bytes! */
2981 /* We don't know much -- general case. */
f2ed9b32 2982 if (utf8_target) {
d6a28714 2983 for (;;) {
02d5137b 2984 if (regtry(reginfo, &s))
d6a28714
JH
2985 goto got_it;
2986 if (s >= strend)
2987 break;
b8c5462f 2988 s += UTF8SKIP(s);
d6a28714
JH
2989 };
2990 }
2991 else {
2992 do {
02d5137b 2993 if (regtry(reginfo, &s))
d6a28714
JH
2994 goto got_it;
2995 } while (s++ < strend);
2996 }
2997 }
2998
2999 /* Failure. */
3000 goto phooey;
3001
3002got_it:
d5e7783a
DM
3003 /* s/// doesn't like it if $& is earlier than where we asked it to
3004 * start searching (which can happen on something like /.\G/) */
3005 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3006 && (prog->offs[0].start < stringarg - strbeg))
3007 {
3008 /* this should only be possible under \G */
58430ea8 3009 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
3010 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3011 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3012 goto phooey;
3013 }
3014
495f47a5
DM
3015 DEBUG_BUFFERS_r(
3016 if (swap)
3017 PerlIO_printf(Perl_debug_log,
3018 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3019 PTR2UV(prog),
3020 PTR2UV(swap)
3021 );
3022 );
e9105d30 3023 Safefree(swap);
d6a28714 3024
bf2039a9
DM
3025 /* clean up; this will trigger destructors that will free all slabs
3026 * above the current one, and cleanup the regmatch_info_aux
3027 * and regmatch_info_aux_eval sructs */
8adc0f72 3028
006f26b2
DM
3029 LEAVE_SCOPE(oldsave);
3030
5daac39c
NC
3031 if (RXp_PAREN_NAMES(prog))
3032 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714 3033
0254aed9
DM
3034 RX_MATCH_UTF8_set(rx, utf8_target);
3035
d6a28714 3036 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3037 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3038 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3039 strbeg, reginfo->strend,
3040 sv, flags, utf8_target);
9041c2e3 3041
d6a28714
JH
3042 return 1;
3043
3044phooey:
a3621e74 3045 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 3046 PL_colors[4], PL_colors[5]));
8adc0f72 3047
bf2039a9
DM
3048 /* clean up; this will trigger destructors that will free all slabs
3049 * above the current one, and cleanup the regmatch_info_aux
3050 * and regmatch_info_aux_eval sructs */
8adc0f72 3051
006f26b2
DM
3052 LEAVE_SCOPE(oldsave);
3053
e9105d30 3054 if (swap) {
c74340f9 3055 /* we failed :-( roll it back */
495f47a5
DM
3056 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3057 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3058 PTR2UV(prog),
3059 PTR2UV(prog->offs),
3060 PTR2UV(swap)
3061 ));
e9105d30
GG
3062 Safefree(prog->offs);
3063 prog->offs = swap;
3064 }
d6a28714
JH
3065 return 0;
3066}
3067
6bda09f9 3068
b3d298be 3069/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3070 * Do inc before dec, in case old and new rex are the same */
baa60164 3071#define SET_reg_curpm(Re2) \
bf2039a9 3072 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3073 (void)ReREFCNT_inc(Re2); \
3074 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3075 PM_SETRE((PL_reg_curpm), (Re2)); \
3076 }
3077
3078
d6a28714
JH
3079/*
3080 - regtry - try match at specific point
3081 */
3082STATIC I32 /* 0 failure, 1 success */
f73aaa43 3083S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3084{
97aff369 3085 dVAR;
d6a28714 3086 CHECKPOINT lastcp;
288b8c02 3087 REGEXP *const rx = reginfo->prog;
8d919b0a 3088 regexp *const prog = ReANY(rx);
99a90e59 3089 SSize_t result;
f8fc2ecf 3090 RXi_GET_DECL(prog,progi);
a3621e74 3091 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3092
3093 PERL_ARGS_ASSERT_REGTRY;
3094
24b23f37 3095 reginfo->cutpoint=NULL;
d6a28714 3096
9d9163fb 3097 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3098 prog->lastparen = 0;
03994de8 3099 prog->lastcloseparen = 0;
d6a28714
JH
3100
3101 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3102 to do this again and again, prog->lastparen should take care of
3dd2943c 3103 this! --ilya*/
dafc8851
JH
3104
3105 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3106 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3107 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3108 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3109 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3110 * Meanwhile, this code *is* needed for the
daf18116
JH
3111 * above-mentioned test suite tests to succeed. The common theme
3112 * on those tests seems to be returning null fields from matches.
225593e1 3113 * --jhi updated by dapm */
dafc8851 3114#if 1
d6a28714 3115 if (prog->nparens) {
b93070ed 3116 regexp_paren_pair *pp = prog->offs;
eb578fdb 3117 I32 i;
b93070ed 3118 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3119 ++pp;
3120 pp->start = -1;
3121 pp->end = -1;
d6a28714
JH
3122 }
3123 }
dafc8851 3124#endif
02db2b7b 3125 REGCP_SET(lastcp);
f73aaa43
DM
3126 result = regmatch(reginfo, *startposp, progi->program + 1);
3127 if (result != -1) {
3128 prog->offs[0].end = result;
d6a28714
JH
3129 return 1;
3130 }
24b23f37 3131 if (reginfo->cutpoint)
f73aaa43 3132 *startposp= reginfo->cutpoint;
02db2b7b 3133 REGCP_UNWIND(lastcp);
d6a28714
JH
3134 return 0;
3135}
3136
02db2b7b 3137
8ba1375e
MJD
3138#define sayYES goto yes
3139#define sayNO goto no
262b90c4 3140#define sayNO_SILENT goto no_silent
8ba1375e 3141
f9f4320a
YO
3142/* we dont use STMT_START/END here because it leads to
3143 "unreachable code" warnings, which are bogus, but distracting. */
3144#define CACHEsayNO \
c476f425 3145 if (ST.cache_mask) \
2ac8ff4b 3146 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3147 sayNO
3298f257 3148
a3621e74 3149/* this is used to determine how far from the left messages like
265c4333
YO
3150 'failed...' are printed. It should be set such that messages
3151 are inline with the regop output that created them.
a3621e74 3152*/
265c4333 3153#define REPORT_CODE_OFF 32
a3621e74
YO
3154
3155
40a82448
DM
3156#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3157#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3158#define CHRTEST_NOT_A_CP_1 -999
3159#define CHRTEST_NOT_A_CP_2 -998
9e137952 3160
5d9a96ca
DM
3161/* grab a new slab and return the first slot in it */
3162
3163STATIC regmatch_state *
3164S_push_slab(pTHX)
3165{
a35a87e7 3166#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3167 dMY_CXT;
3168#endif
5d9a96ca
DM
3169 regmatch_slab *s = PL_regmatch_slab->next;
3170 if (!s) {
3171 Newx(s, 1, regmatch_slab);
3172 s->prev = PL_regmatch_slab;
3173 s->next = NULL;
3174 PL_regmatch_slab->next = s;
3175 }
3176 PL_regmatch_slab = s;
86545054 3177 return SLAB_FIRST(s);
5d9a96ca 3178}
5b47454d 3179
95b24440 3180
40a82448
DM
3181/* push a new state then goto it */
3182
4d5016e5
DM
3183#define PUSH_STATE_GOTO(state, node, input) \
3184 pushinput = input; \
40a82448
DM
3185 scan = node; \
3186 st->resume_state = state; \
3187 goto push_state;
3188
3189/* push a new state with success backtracking, then goto it */
3190
4d5016e5
DM
3191#define PUSH_YES_STATE_GOTO(state, node, input) \
3192 pushinput = input; \
40a82448
DM
3193 scan = node; \
3194 st->resume_state = state; \
3195 goto push_yes_state;
3196
aa283a38 3197
aa283a38 3198
4d5016e5 3199
d6a28714 3200/*
95b24440 3201
bf1f174e
DM
3202regmatch() - main matching routine
3203
3204This is basically one big switch statement in a loop. We execute an op,
3205set 'next' to point the next op, and continue. If we come to a point which
3206we may need to backtrack to on failure such as (A|B|C), we push a
3207backtrack state onto the backtrack stack. On failure, we pop the top
3208state, and re-enter the loop at the state indicated. If there are no more
3209states to pop, we return failure.
3210
3211Sometimes we also need to backtrack on success; for example /A+/, where
3212after successfully matching one A, we need to go back and try to
3213match another one; similarly for lookahead assertions: if the assertion
3214completes successfully, we backtrack to the state just before the assertion
3215and then carry on. In these cases, the pushed state is marked as
3216'backtrack on success too'. This marking is in fact done by a chain of
3217pointers, each pointing to the previous 'yes' state. On success, we pop to
3218the nearest yes state, discarding any intermediate failure-only states.
3219Sometimes a yes state is pushed just to force some cleanup code to be
3220called at the end of a successful match or submatch; e.g. (??{$re}) uses
3221it to free the inner regex.
3222
3223Note that failure backtracking rewinds the cursor position, while
3224success backtracking leaves it alone.
3225
3226A pattern is complete when the END op is executed, while a subpattern
3227such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3228ops trigger the "pop to last yes state if any, otherwise return true"
3229behaviour.
3230
3231A common convention in this function is to use A and B to refer to the two
3232subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3233the subpattern to be matched possibly multiple times, while B is the entire
3234rest of the pattern. Variable and state names reflect this convention.
3235
3236The states in the main switch are the union of ops and failure/success of
3237substates associated with with that op. For example, IFMATCH is the op
3238that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3239'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3240successfully matched A and IFMATCH_A_fail is a state saying that we have
3241just failed to match A. Resume states always come in pairs. The backtrack
3242state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3243at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3244on success or failure.
3245
3246The struct that holds a backtracking state is actually a big union, with
3247one variant for each major type of op. The variable st points to the
3248top-most backtrack struct. To make the code clearer, within each
3249block of code we #define ST to alias the relevant union.
3250
3251Here's a concrete example of a (vastly oversimplified) IFMATCH
3252implementation:
3253
3254 switch (state) {
3255 ....
3256
3257#define ST st->u.ifmatch
3258
3259 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3260 ST.foo = ...; // some state we wish to save
95b24440 3261 ...
bf1f174e
DM
3262 // push a yes backtrack state with a resume value of
3263 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3264 // first node of A:
4d5016e5 3265 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3266 // NOTREACHED
3267
3268 case IFMATCH_A: // we have successfully executed A; now continue with B
3269 next = B;
3270 bar = ST.foo; // do something with the preserved value
3271 break;
3272
3273 case IFMATCH_A_fail: // A failed, so the assertion failed
3274 ...; // do some housekeeping, then ...
3275 sayNO; // propagate the failure
3276
3277#undef ST
95b24440 3278
bf1f174e
DM
3279 ...
3280 }
95b24440 3281
bf1f174e
DM
3282For any old-timers reading this who are familiar with the old recursive
3283approach, the code above is equivalent to:
95b24440 3284
bf1f174e
DM
3285 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3286 {
3287 int foo = ...
95b24440 3288 ...
bf1f174e
DM
3289 if (regmatch(A)) {
3290 next = B;
3291 bar = foo;
3292 break;
95b24440 3293 }
bf1f174e
DM
3294 ...; // do some housekeeping, then ...
3295 sayNO; // propagate the failure
95b24440 3296 }
bf1f174e
DM
3297
3298The topmost backtrack state, pointed to by st, is usually free. If you
3299want to claim it, populate any ST.foo fields in it with values you wish to
3300save, then do one of
3301
4d5016e5
DM
3302 PUSH_STATE_GOTO(resume_state, node, newinput);
3303 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3304
3305which sets that backtrack state's resume value to 'resume_state', pushes a
3306new free entry to the top of the backtrack stack, then goes to 'node'.
3307On backtracking, the free slot is popped, and the saved state becomes the
3308new free state. An ST.foo field in this new top state can be temporarily
3309accessed to retrieve values, but once the main loop is re-entered, it
3310becomes available for reuse.
3311
3312Note that the depth of the backtrack stack constantly increases during the
3313left-to-right execution of the pattern, rather than going up and down with
3314the pattern nesting. For example the stack is at its maximum at Z at the
3315end of the pattern, rather than at X in the following:
3316
3317 /(((X)+)+)+....(Y)+....Z/
3318
3319The only exceptions to this are lookahead/behind assertions and the cut,
3320(?>A), which pop all the backtrack states associated with A before
3321continuing.
3322
486ec47a 3323Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3324PL_regmatch_state and st always point to the currently active state,
3325and PL_regmatch_slab points to the slab currently containing
3326PL_regmatch_state. The first time regmatch() is called, the first slab is
3327allocated, and is never freed until interpreter destruction. When the slab
3328is full, a new one is allocated and chained to the end. At exit from
3329regmatch(), slabs allocated since entry are freed.
3330
3331*/
95b24440 3332
40a82448 3333
5bc10b2c 3334#define DEBUG_STATE_pp(pp) \
265c4333 3335 DEBUG_STATE_r({ \
baa60164 3336 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3337 PerlIO_printf(Perl_debug_log, \
5d458dd8 3338 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3339 depth*2, "", \
baa60164 3340 PL_reg_name[st->resume_state], \
5d458dd8
YO
3341 ((st==yes_state||st==mark_state) ? "[" : ""), \
3342 ((st==yes_state) ? "Y" : ""), \
3343 ((st==mark_state) ? "M" : ""), \
3344 ((st==yes_state||st==mark_state) ? "]" : "") \
3345 ); \
265c4333 3346 });
5bc10b2c 3347
40a82448 3348
3dab1dad 3349#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3350
3df15adc 3351#ifdef DEBUGGING
5bc10b2c 3352
ab3bbdeb 3353STATIC void
f2ed9b32 3354S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3355 const char *start, const char *end, const char *blurb)
3356{
efd26800 3357 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3358
3359 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3360
ab3bbdeb
YO
3361 if (!PL_colorset)
3362 reginitcolors();
3363 {
3364 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3365 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3366
f2ed9b32 3367 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3368 start, end - start, 60);
3369
3370 PerlIO_printf(Perl_debug_log,
3371 "%s%s REx%s %s against %s\n",
3372 PL_colors[4], blurb, PL_colors[5], s0, s1);
3373
f2ed9b32 3374 if (utf8_target||utf8_pat)
1de06328
YO
3375 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3376 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3377 utf8_pat && utf8_target ? " and " : "",
3378 utf8_target ? "string" : ""
ab3bbdeb
YO
3379 );
3380 }
3381}
3df15adc
YO
3382
3383STATIC void
786e8c11
YO
3384S_dump_exec_pos(pTHX_ const char *locinput,
3385 const regnode *scan,
3386 const char *loc_regeol,
3387 const char *loc_bostr,
3388 const char *loc_reg_starttry,
f2ed9b32 3389 const bool utf8_target)
07be1b83 3390{
786e8c11 3391 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3392 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3393 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3394 /* The part of the string before starttry has one color
3395 (pref0_len chars), between starttry and current
3396 position another one (pref_len - pref0_len chars),
3397 after the current position the third one.
3398 We assume that pref0_len <= pref_len, otherwise we
3399 decrease pref0_len. */
786e8c11
YO
3400 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3401 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3402 int pref0_len;
3403
7918f24d
NC
3404 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3405
f2ed9b32 3406 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3407 pref_len++;
786e8c11
YO
3408 pref0_len = pref_len - (locinput - loc_reg_starttry);
3409 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3410 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3411 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3412 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3413 l--;
3414 if (pref0_len < 0)
3415 pref0_len = 0;
3416 if (pref0_len > pref_len)
3417 pref0_len = pref_len;
3418 {
f2ed9b32 3419 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3420
ab3bbdeb 3421 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3422 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3423
ab3bbdeb 3424 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3425 (locinput - pref_len + pref0_len),
1de06328 3426 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3427
ab3bbdeb 3428 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3429 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3430
1de06328 3431 const STRLEN tlen=len0+len1+len2;
3df15adc 3432 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3433 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3434 (IV)(locinput - loc_bostr),
07be1b83 3435 len0, s0,
07be1b83 3436 len1, s1,
07be1b83 3437 (docolor ? "" : "> <"),
07be1b83 3438 len2, s2,
f9f4320a 3439 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3440 "");
3441 }
3442}
3df15adc 3443
07be1b83
YO
3444#endif
3445
0a4db386
YO
3446/* reg_check_named_buff_matched()
3447 * Checks to see if a named buffer has matched. The data array of
3448 * buffer numbers corresponding to the buffer is expected to reside
3449 * in the regexp->data->data array in the slot stored in the ARG() of
3450 * node involved. Note that this routine doesn't actually care about the
3451 * name, that information is not preserved from compilation to execution.
3452 * Returns the index of the leftmost defined buffer with the given name
3453 * or 0 if non of the buffers matched.
3454 */
3455STATIC I32
7918f24d
NC
3456S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3457{
0a4db386 3458 I32 n;
f8fc2ecf 3459 RXi_GET_DECL(rex,rexi);
ad64d0ec 3460 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3461 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3462
3463 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3464
0a4db386 3465 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3466 if ((I32)rex->lastparen >= nums[n] &&
3467 rex->offs[nums[n]].end != -1)
0a4db386
YO
3468 {
3469 return nums[n];
3470 }
3471 }
3472 return 0;
3473}
3474
2f554ef7 3475
c74f6de9 3476static bool
984e6dd1 3477S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 3478 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 3479{
79a2a0e8
KW
3480 /* This function determines if there are one or two characters that match
3481 * the first character of the passed-in EXACTish node <text_node>, and if
3482 * so, returns them in the passed-in pointers.
c74f6de9 3483 *
79a2a0e8
KW
3484 * If it determines that no possible character in the target string can
3485 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3486 * the first character in <text_node> requires UTF-8 to represent, and the
3487 * target string isn't in UTF-8.)
c74f6de9 3488 *
79a2a0e8
KW
3489 * If there are more than two characters that could match the beginning of
3490 * <text_node>, or if more context is required to determine a match or not,
3491 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3492 *
3493 * The motiviation behind this function is to allow the caller to set up
3494 * tight loops for matching. If <text_node> is of type EXACT, there is
3495 * only one possible character that can match its first character, and so
3496 * the situation is quite simple. But things get much more complicated if
3497 * folding is involved. It may be that the first character of an EXACTFish
3498 * node doesn't participate in any possible fold, e.g., punctuation, so it
3499 * can be matched only by itself. The vast majority of characters that are
3500 * in folds match just two things, their lower and upper-case equivalents.
3501 * But not all are like that; some have multiple possible matches, or match
3502 * sequences of more than one character. This function sorts all that out.
3503 *
3504 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3505 * loop of trying to match A*, we know we can't exit where the thing
3506 * following it isn't a B. And something can't be a B unless it is the
3507 * beginning of B. By putting a quick test for that beginning in a tight
3508 * loop, we can rule out things that can't possibly be B without having to
3509 * break out of the loop, thus avoiding work. Similarly, if A is a single
3510 * character, we can make a tight loop matching A*, using the outputs of
3511 * this function.
3512 *
3513 * If the target string to match isn't in UTF-8, and there aren't
3514 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3515 * the one or two possible octets (which are characters in this situation)
3516 * that can match. In all cases, if there is only one character that can
3517 * match, *<c1p> and *<c2p> will be identical.
3518 *
3519 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3520 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3521 * can match the beginning of <text_node>. They should be declared with at
3522 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3523 * undefined what these contain.) If one or both of the buffers are
3524 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3525 * corresponding invariant. If variant,