This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
re_intuit_start(): in MBOL block, eliminate t var
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
4ac71550
TC
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
81e983c1 83#include "inline_invlist.c"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
e1cf74e3
CB
86#ifdef DEBUGGING
87/* At least one required character in the target string is expressible only in
88 * UTF-8. */
89static const char* const non_utf8_target_but_utf8_required
90 = "Can't match, because target string needs to be in UTF-8\n";
91#endif
92
93#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
94 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
95 goto target; \
96} STMT_END
97
c74f6de9
KW
98#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
2d66f61e 104/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
3db24e1e 107#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 108 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 109
c277df42
IZ
110/*
111 * Forwards.
112 */
113
f2ed9b32 114#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 115#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 116
3dab1dad 117#define HOPc(pos,off) \
ba44c216 118 (char *)(reginfo->is_utf8_target \
220db18a 119 ? reghop3((U8*)pos, off, \
9d9163fb 120 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad
YO
121 : (U8*)(pos + off))
122#define HOPBACKc(pos, off) \
ba44c216 123 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
124 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
125 : (pos - off >= reginfo->strbeg) \
8e11feef 126 ? (U8*)pos - off \
3dab1dad 127 : NULL)
efb30f32 128
ba44c216 129#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 130#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 131
8e9f2289
DM
132/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
133 * off must be >=0; args should be vars rather than expressions */
134#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
135 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
136 : (U8*)((pos + off) > lim ? lim : (pos + off)))
137
2974eaec
DM
138#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
139 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
140 : (U8*)(pos + off))
141#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
142
143#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
144#define NEXTCHR_IS_EOS (nextchr < 0)
145
146#define SET_nextchr \
220db18a 147 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
148
149#define SET_locinput(p) \
150 locinput = (p); \
151 SET_nextchr
152
153
2a16ac92 154#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
155 if (!swash_ptr) { \
156 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 157 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 158 1, 0, invlist, &flags); \
c7304fe2
KW
159 assert(swash_ptr); \
160 } \
161 } STMT_END
162
163/* If in debug mode, we test that a known character properly matches */
164#ifdef DEBUGGING
165# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
166 property_name, \
2a16ac92 167 invlist, \
c7304fe2 168 utf8_char_in_property) \
2a16ac92 169 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
170 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
171#else
172# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
173 property_name, \
2a16ac92 174 invlist, \
c7304fe2 175 utf8_char_in_property) \
2a16ac92 176 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 177#endif
d1eb3177 178
c7304fe2
KW
179#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
180 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
181 "", \
182 PL_XPosix_ptrs[_CC_WORDCHAR], \
df38da56 183 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
c7304fe2
KW
184
185#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
186 STMT_START { \
187 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
188 "_X_regular_begin", \
2a16ac92 189 NULL, \
df38da56 190 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
c7304fe2
KW
191 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
192 "_X_extend", \
2a16ac92 193 NULL, \
c7304fe2
KW
194 COMBINING_GRAVE_ACCENT_UTF8); \
195 } STMT_END
d1eb3177 196
c7304fe2 197#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
198/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
199
5f80c4cf 200/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
201/* it would be nice to rework regcomp.sym to generate this stuff. sigh
202 *
203 * NOTE that *nothing* that affects backtracking should be in here, specifically
204 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
205 * node that is in between two EXACT like nodes when ascertaining what the required
206 * "follow" character is. This should probably be moved to regex compile time
207 * although it may be done at run time beause of the REF possibility - more
208 * investigation required. -- demerphq
209*/
baa60164
KW
210#define JUMPABLE(rn) ( \
211 OP(rn) == OPEN || \
3e901dc0 212 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
baa60164
KW
213 OP(rn) == EVAL || \
214 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
215 OP(rn) == PLUS || OP(rn) == MINMOD || \
216 OP(rn) == KEEPS || \
217 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 218)
ee9b8eae 219#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 220
ee9b8eae
YO
221#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
222
223#if 0
224/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
225 we don't need this definition. */
226#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 227#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
228#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
229
230#else
231/* ... so we use this as its faster. */
232#define IS_TEXT(rn) ( OP(rn)==EXACT )
098b07d5 233#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
234#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
235#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
236
237#endif
e2d8ce26 238
a84d97b6
HS
239/*
240 Search for mandatory following text node; for lookahead, the text must
241 follow but for lookbehind (rn->flags != 0) we skip to the next step.
242*/
baa60164 243#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
244 while (JUMPABLE(rn)) { \
245 const OPCODE type = OP(rn); \
246 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 247 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 248 else if (type == PLUS) \
cca55fe3 249 rn = NEXTOPER(rn); \
3dab1dad 250 else if (type == IFMATCH) \
a84d97b6 251 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 252 else rn += NEXT_OFF(rn); \
3dab1dad 253 } \
5f80c4cf 254} STMT_END
74750237 255
22913b96
KW
256/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
257 * These are for the pre-composed Hangul syllables, which are all in a
258 * contiguous block and arranged there in such a way so as to facilitate
259 * alorithmic determination of their characteristics. As such, they don't need
260 * a swash, but can be determined by simple arithmetic. Almost all are
261 * GCB=LVT, but every 28th one is a GCB=LV */
262#define SBASE 0xAC00 /* Start of block */
263#define SCount 11172 /* Length of block */
264#define TCount 28
c476f425 265
006f26b2
DM
266#define SLAB_FIRST(s) (&(s)->states[0])
267#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
268
a75351a1 269static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 270static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 271static regmatch_state * S_push_slab(pTHX);
51371543 272
87c0511b 273#define REGCP_PAREN_ELEMS 3
f067efbf 274#define REGCP_OTHER_ELEMS 3
e0fa7e2b 275#define REGCP_FRAME_ELEMS 1
620d5b66
NC
276/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
277 * are needed for the regexp context stack bookkeeping. */
278
76e3520e 279STATIC CHECKPOINT
92da3157 280S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 281{
97aff369 282 dVAR;
a3b680e6 283 const int retval = PL_savestack_ix;
92da3157
DM
284 const int paren_elems_to_push =
285 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
286 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
287 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 288 I32 p;
40a82448 289 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 290
b93070ed
DM
291 PERL_ARGS_ASSERT_REGCPPUSH;
292
e49a9654 293 if (paren_elems_to_push < 0)
5637ef5b
NC
294 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
295 paren_elems_to_push);
e49a9654 296
e0fa7e2b
NC
297 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
298 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 299 " out of range (%lu-%ld)",
92da3157
DM
300 total_elems,
301 (unsigned long)maxopenparen,
302 (long)parenfloor);
e0fa7e2b 303
620d5b66 304 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 305
495f47a5 306 DEBUG_BUFFERS_r(
92da3157 307 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
308 PerlIO_printf(Perl_debug_log,
309 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
310 PTR2UV(rex),
311 PTR2UV(rex->offs)
312 );
313 );
92da3157 314 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 315/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
316 SSPUSHIV(rex->offs[p].end);
317 SSPUSHIV(rex->offs[p].start);
1ca2007e 318 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 319 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
320 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
321 (UV)p,
322 (IV)rex->offs[p].start,
323 (IV)rex->offs[p].start_tmp,
324 (IV)rex->offs[p].end
40a82448 325 ));
a0d0e21e 326 }
b1ce53c5 327/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 328 SSPUSHINT(maxopenparen);
b93070ed
DM
329 SSPUSHINT(rex->lastparen);
330 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 331 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 332
a0d0e21e
LW
333 return retval;
334}
335
c277df42 336/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
337#define REGCP_SET(cp) \
338 DEBUG_STATE_r( \
ab3bbdeb 339 PerlIO_printf(Perl_debug_log, \
e4f74956 340 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
341 (IV)PL_savestack_ix)); \
342 cp = PL_savestack_ix
c3464db5 343
ab3bbdeb 344#define REGCP_UNWIND(cp) \
e4f74956 345 DEBUG_STATE_r( \
ab3bbdeb 346 if (cp != PL_savestack_ix) \
e4f74956
YO
347 PerlIO_printf(Perl_debug_log, \
348 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
349 (IV)(cp), (IV)PL_savestack_ix)); \
350 regcpblow(cp)
c277df42 351
a8d1f4b4
DM
352#define UNWIND_PAREN(lp, lcp) \
353 for (n = rex->lastparen; n > lp; n--) \
354 rex->offs[n].end = -1; \
355 rex->lastparen = n; \
356 rex->lastcloseparen = lcp;
357
358
f067efbf 359STATIC void
92da3157 360S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 361{
97aff369 362 dVAR;
e0fa7e2b 363 UV i;
87c0511b 364 U32 paren;
a3621e74
YO
365 GET_RE_DEBUG_FLAGS_DECL;
366
7918f24d
NC
367 PERL_ARGS_ASSERT_REGCPPOP;
368
b1ce53c5 369 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 370 i = SSPOPUV;
e0fa7e2b
NC
371 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
372 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
373 rex->lastcloseparen = SSPOPINT;
374 rex->lastparen = SSPOPINT;
92da3157 375 *maxopenparen_p = SSPOPINT;
b1ce53c5 376
620d5b66 377 i -= REGCP_OTHER_ELEMS;
b1ce53c5 378 /* Now restore the parentheses context. */
495f47a5
DM
379 DEBUG_BUFFERS_r(
380 if (i || rex->lastparen + 1 <= rex->nparens)
381 PerlIO_printf(Perl_debug_log,
382 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
383 PTR2UV(rex),
384 PTR2UV(rex->offs)
385 );
386 );
92da3157 387 paren = *maxopenparen_p;
620d5b66 388 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 389 SSize_t tmps;
1ca2007e 390 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
391 rex->offs[paren].start = SSPOPIV;
392 tmps = SSPOPIV;
b93070ed
DM
393 if (paren <= rex->lastparen)
394 rex->offs[paren].end = tmps;
495f47a5
DM
395 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
396 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
397 (UV)paren,
398 (IV)rex->offs[paren].start,
399 (IV)rex->offs[paren].start_tmp,
400 (IV)rex->offs[paren].end,
401 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 402 );
87c0511b 403 paren--;
a0d0e21e 404 }
daf18116 405#if 1
dafc8851
JH
406 /* It would seem that the similar code in regtry()
407 * already takes care of this, and in fact it is in
408 * a better location to since this code can #if 0-ed out
409 * but the code in regtry() is needed or otherwise tests
410 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
411 * (as of patchlevel 7877) will fail. Then again,
412 * this code seems to be necessary or otherwise
225593e1
DM
413 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
414 * --jhi updated by dapm */
b93070ed 415 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 416 if (i > *maxopenparen_p)
b93070ed
DM
417 rex->offs[i].start = -1;
418 rex->offs[i].end = -1;
495f47a5
DM
419 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
420 " \\%"UVuf": %s ..-1 undeffing\n",
421 (UV)i,
92da3157 422 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 423 ));
a0d0e21e 424 }
dafc8851 425#endif
a0d0e21e
LW
426}
427
74088413
DM
428/* restore the parens and associated vars at savestack position ix,
429 * but without popping the stack */
430
431STATIC void
92da3157 432S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
433{
434 I32 tmpix = PL_savestack_ix;
435 PL_savestack_ix = ix;
92da3157 436 regcppop(rex, maxopenparen_p);
74088413
DM
437 PL_savestack_ix = tmpix;
438}
439
02db2b7b 440#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 441
31c7f561
KW
442STATIC bool
443S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
444{
445 /* Returns a boolean as to whether or not 'character' is a member of the
446 * Posix character class given by 'classnum' that should be equivalent to a
447 * value in the typedef '_char_class_number'.
448 *
449 * Ideally this could be replaced by a just an array of function pointers
450 * to the C library functions that implement the macros this calls.
451 * However, to compile, the precise function signatures are required, and
452 * these may vary from platform to to platform. To avoid having to figure
453 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
454 * which adds an extra layer of function call overhead (unless the C
455 * optimizer strips it away). But we don't particularly care about
456 * performance with locales anyway. */
31c7f561
KW
457
458 switch ((_char_class_number) classnum) {
15861f94 459 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 460 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
461 case _CC_ENUM_ASCII: return isASCII_LC(character);
462 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
463 case _CC_ENUM_CASED: return isLOWER_LC(character)
464 || isUPPER_LC(character);
e8d596e0 465 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
466 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
467 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
468 case _CC_ENUM_LOWER: return isLOWER_LC(character);
469 case _CC_ENUM_PRINT: return isPRINT_LC(character);
e8d596e0 470 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
31c7f561 471 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 472 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
473 case _CC_ENUM_UPPER: return isUPPER_LC(character);
474 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 475 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
476 default: /* VERTSPACE should never occur in locales */
477 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
478 }
479
480 assert(0); /* NOTREACHED */
481 return FALSE;
482}
483
3018b823
KW
484STATIC bool
485S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
486{
487 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
488 * 'character' is a member of the Posix character class given by 'classnum'
489 * that should be equivalent to a value in the typedef
490 * '_char_class_number'.
491 *
492 * This just calls isFOO_lc on the code point for the character if it is in
493 * the range 0-255. Outside that range, all characters avoid Unicode
494 * rules, ignoring any locale. So use the Unicode function if this class
495 * requires a swash, and use the Unicode macro otherwise. */
496
497 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
498
499 if (UTF8_IS_INVARIANT(*character)) {
500 return isFOO_lc(classnum, *character);
501 }
502 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
503 return isFOO_lc(classnum,
94bb8c36 504 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
505 }
506
507 if (classnum < _FIRST_NON_SWASH_CC) {
508
509 /* Initialize the swash unless done already */
510 if (! PL_utf8_swash_ptrs[classnum]) {
511 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
512 PL_utf8_swash_ptrs[classnum] =
513 _core_swash_init("utf8",
514 "",
515 &PL_sv_undef, 1, 0,
516 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
517 }
518
92a2046b
KW
519 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
520 character,
521 TRUE /* is UTF */ ));
3018b823
KW
522 }
523
524 switch ((_char_class_number) classnum) {
525 case _CC_ENUM_SPACE:
526 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
527
528 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
529 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
530 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
531 default: return 0; /* Things like CNTRL are always
532 below 256 */
533 }
534
535 assert(0); /* NOTREACHED */
536 return FALSE;
537}
538
a687059c 539/*
e50aee73 540 * pregexec and friends
a687059c
LW
541 */
542
76234dfb 543#ifndef PERL_IN_XSUB_RE
a687059c 544/*
c277df42 545 - pregexec - match a regexp against a string
a687059c 546 */
c277df42 547I32
5aaab254 548Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 549 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
550/* stringarg: the point in the string at which to begin matching */
551/* strend: pointer to null at end of string */
552/* strbeg: real beginning of string */
553/* minend: end of match must be >= minend bytes after stringarg. */
554/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
555 * itself is accessed via the pointers above */
556/* nosave: For optimizations. */
c277df42 557{
7918f24d
NC
558 PERL_ARGS_ASSERT_PREGEXEC;
559
c277df42 560 return
9041c2e3 561 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
562 nosave ? 0 : REXEC_COPY_STR);
563}
76234dfb 564#endif
22e551b9 565
9041c2e3 566/*
cad2e5aa
JH
567 * Need to implement the following flags for reg_anch:
568 *
569 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
570 * USE_INTUIT_ML
571 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
572 * INTUIT_AUTORITATIVE_ML
573 * INTUIT_ONCE_NOML - Intuit can match in one location only.
574 * INTUIT_ONCE_ML
575 *
576 * Another flag for this function: SECOND_TIME (so that float substrs
577 * with giant delta may be not rechecked).
578 */
579
3f7c398e 580/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
cad2e5aa
JH
581 Otherwise, only SvCUR(sv) is used to get strbeg. */
582
6eb5f6b9
JH
583/* XXXX Some places assume that there is a fixed substring.
584 An update may be needed if optimizer marks as "INTUITable"
585 RExen without fixed substrings. Similarly, it is assumed that
586 lengths of all the strings are no more than minlen, thus they
587 cannot come from lookahead.
40d049e4
YO
588 (Or minlen should take into account lookahead.)
589 NOTE: Some of this comment is not correct. minlen does now take account
590 of lookahead/behind. Further research is required. -- demerphq
591
592*/
6eb5f6b9 593
2c2d71f5
JH
594/* A failure to find a constant substring means that there is no need to make
595 an expensive call to REx engine, thus we celebrate a failure. Similarly,
d8da0584 596 finding a substring too deep into the string means that fewer calls to
30944b6d
IZ
597 regtry() should be needed.
598
599 REx compiler's optimizer found 4 possible hints:
600 a) Anchored substring;
601 b) Fixed substring;
602 c) Whether we are anchored (beginning-of-line or \G);
486ec47a 603 d) First node (of those at offset 0) which may distinguish positions;
6eb5f6b9 604 We use a)b)d) and multiline-part of c), and try to find a position in the
30944b6d
IZ
605 string which does not contradict any of them.
606 */
2c2d71f5 607
6eb5f6b9
JH
608/* Most of decisions we do here should have been done at compile time.
609 The nodes of the REx which we used for the search should have been
610 deleted from the finite automaton. */
611
52a21eb3
DM
612/* args:
613 * rx: the regex to match against
614 * sv: the SV being matched: only used for utf8 flag; the string
615 * itself is accessed via the pointers below. Note that on
616 * something like an overloaded SV, SvPOK(sv) may be false
617 * and the string pointers may point to something unrelated to
618 * the SV itself.
619 * strbeg: real beginning of string
620 * strpos: the point in the string at which to begin matching
621 * strend: pointer to the byte following the last char of the string
622 * flags currently unused; set to 0
623 * data: currently unused; set to NULL
624 */
625
cad2e5aa 626char *
52a21eb3
DM
627Perl_re_intuit_start(pTHX_
628 REGEXP * const rx,
629 SV *sv,
630 const char * const strbeg,
631 char *strpos,
632 char *strend,
633 const U32 flags,
634 re_scream_pos_data *data)
cad2e5aa 635{
97aff369 636 dVAR;
8d919b0a 637 struct regexp *const prog = ReANY(rx);
49f55535 638 SSize_t start_shift = 0;
cad2e5aa 639 /* Should be nonnegative! */
ea3daa5d 640 SSize_t end_shift = 0;
0fc004dd
DM
641 /* current lowest pos in string where the regex can start matching */
642 char *rx_origin = strpos;
eb578fdb 643 SV *check;
f2ed9b32 644 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 645 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 646 bool ml_anch = 0;
8f4bf5fc 647 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 648 char *check_at = NULL; /* check substr found at this pos */
d8080198 649 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
bbe252da 650 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 651 RXi_GET_DECL(prog,progi);
02d5137b
DM
652 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
653 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
654 GET_RE_DEBUG_FLAGS_DECL;
655
7918f24d 656 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
657 PERL_UNUSED_ARG(flags);
658 PERL_UNUSED_ARG(data);
7918f24d 659
1dc475d0
DM
660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
661 "Intuit: trying to determine minimum start position...\n"));
662
fb9bbddb
DM
663 /* for now, assume that all substr offsets are positive. If at some point
664 * in the future someone wants to do clever things with look-behind and
665 * -ve offsets, they'll need to fix up any code in this function
666 * which uses these offsets. See the thread beginning
667 * <20140113145929.GF27210@iabyn.com>
668 */
669 assert(prog->substrs->data[0].min_offset >= 0);
670 assert(prog->substrs->data[0].max_offset >= 0);
671 assert(prog->substrs->data[1].min_offset >= 0);
672 assert(prog->substrs->data[1].max_offset >= 0);
673 assert(prog->substrs->data[2].min_offset >= 0);
674 assert(prog->substrs->data[2].max_offset >= 0);
675
c344f387
JH
676 /* CHR_DIST() would be more correct here but it makes things slow. */
677 if (prog->minlen > strend - strpos) {
a3621e74 678 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 679 " String too short...\n"));
cad2e5aa 680 goto fail;
2c2d71f5 681 }
d8da0584 682
6c3fea77 683 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 684 reginfo->info_aux = NULL;
9d9163fb 685 reginfo->strbeg = strbeg;
220db18a 686 reginfo->strend = strend;
aed7b151 687 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 688 reginfo->intuit = 1;
1cb48e53
DM
689 /* not actually used within intuit, but zero for safety anyway */
690 reginfo->poscache_maxiter = 0;
02d5137b 691
f2ed9b32 692 if (utf8_target) {
33b8afdf
JH
693 if (!prog->check_utf8 && prog->check_substr)
694 to_utf8_substr(prog);
695 check = prog->check_utf8;
696 } else {
7e0d5ad7
KW
697 if (!prog->check_substr && prog->check_utf8) {
698 if (! to_byte_substr(prog)) {
6b54ddc5 699 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
700 }
701 }
33b8afdf
JH
702 check = prog->check_substr;
703 }
274cd312 704
1dc475d0
DM
705 /* dump the various substring data */
706 DEBUG_OPTIMISE_MORE_r({
707 int i;
708 for (i=0; i<=2; i++) {
709 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
710 : prog->substrs->data[i].substr);
711 if (!sv)
712 continue;
713
714 PerlIO_printf(Perl_debug_log,
715 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
716 " useful=%"IVdf" utf8=%d [%s]\n",
717 i,
718 (IV)prog->substrs->data[i].min_offset,
719 (IV)prog->substrs->data[i].max_offset,
720 (IV)prog->substrs->data[i].end_shift,
721 BmUSEFUL(sv),
722 utf8_target ? 1 : 0,
723 SvPEEK(sv));
724 }
725 });
726
8e1490ee 727 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
274cd312
DM
728 /* Check after \n? */
729 ml_anch = ( (prog->intflags & PREGf_ANCH_MBOL)
730 || ((prog->intflags & PREGf_ANCH_BOL) && multiline));
cad2e5aa 731
7e25d62c 732 if (!ml_anch) {
c889ccc8
DM
733 /* we are only allowed to match at BOS or \G */
734
57fcbfa7
DM
735 /* trivially reject if there's a BOS anchor and we're not at BOS.
736 * In the case of \G, we hope(!) that the caller has already
737 * set strpos to pos()-gofs, and will already have checked
738 * that this anchor position is legal. So we can skip it here.
739 */
740 if ( !(prog->intflags & PREGf_ANCH_GPOS)
741 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
742 && (strpos != strbeg))
c889ccc8
DM
743 {
744 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 745 " Not at start...\n"));
c889ccc8
DM
746 goto fail;
747 }
748
a5d12a4b
DM
749 /* in the presence of an anchor, the anchored (relative to the
750 * start of the regex) substr must also be anchored relative
751 * to strpos. So quickly reject if substr isn't found there */
752
c889ccc8
DM
753 if (prog->check_offset_min == prog->check_offset_max
754 && !(prog->intflags & PREGf_CANY_SEEN)
755 && ! multiline) /* /m can cause \n's to match that aren't
756 accounted for in the string max length.
757 See [perl #115242] */
758 {
759 /* Substring at constant offset from beg-of-str... */
d307bf57 760 SSize_t slen = SvCUR(check);
7ef63983 761 char *s;
c889ccc8
DM
762
763 s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 764
1dc475d0
DM
765 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
766 " Looking for check substr at fixed offset %"IVdf"...\n",
767 (IV)prog->check_offset_min));
768
c889ccc8 769 if (SvTAIL(check)) {
d307bf57
DM
770 /* In this case, the regex is anchored at the end too,
771 * so the lengths must match exactly, give or take a \n.
772 * NB: slen >= 1 since the last char of check is \n */
c889ccc8
DM
773 if ( strend - s > slen || strend - s < slen - 1
774 || (strend - s == slen && strend[-1] != '\n'))
775 {
776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 777 " String too long...\n"));
c889ccc8
DM
778 goto fail_finish;
779 }
780 /* Now should match s[0..slen-2] */
781 slen--;
c889ccc8 782 }
d307bf57
DM
783 if (slen && (*SvPVX_const(check) != *s
784 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
785 {
786 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 787 " String not equal...\n"));
d307bf57
DM
788 goto fail_finish;
789 }
c889ccc8
DM
790
791 check_at = s;
792 goto success_at_start;
cad2e5aa 793 }
cad2e5aa 794 }
cad2e5aa 795 }
0fc004dd 796
c0e0ec46
DM
797 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
798 end_shift = prog->check_end_shift;
cad2e5aa 799
19188028 800#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 801 if (end_shift < 0)
1de06328 802 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 803 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
804#endif
805
2c2d71f5 806 restart:
0fc004dd
DM
807 /* Find a candidate regex origin in the region rx_origin..strend
808 * by looking for the "check" substring in that region, corrected by
809 * start/end_shift.
810 */
1de06328
YO
811
812 {
c33e64f0
FC
813 U8* start_point;
814 U8* end_point;
c889ccc8 815
c889ccc8 816 DEBUG_OPTIMISE_MORE_r({
1dc475d0 817 PerlIO_printf(Perl_debug_log,
ae5d4331 818 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
1dc475d0 819 " Start shift: %"IVdf" End shift %"IVdf
4d281893 820 " Real end Shift: %"IVdf"\n",
197641b6 821 (IV)(rx_origin - strpos),
c889ccc8 822 (IV)prog->check_offset_min,
12fbc530
DM
823 (IV)start_shift,
824 (IV)end_shift,
c889ccc8
DM
825 (IV)prog->check_end_shift);
826 });
1de06328 827
0d331aaf 828 if (prog->intflags & PREGf_CANY_SEEN) {
0fc004dd 829 start_point= (U8*)(rx_origin + start_shift);
12fbc530 830 end_point= (U8*)(strend - end_shift);
1de06328 831 } else {
0fc004dd 832 start_point= HOP3(rx_origin, start_shift, strend);
12fbc530 833 end_point= HOP3(strend, -end_shift, strbeg);
1de06328 834 }
c889ccc8 835
77656d5b
DM
836 /* if the regex is absolutely anchored to the start of the string,
837 * then check_offset_max represents an upper bound on the string
838 * where the substr could start */
c19c836a
DM
839 if (!ml_anch
840 && prog->intflags & PREGf_ANCH
77656d5b
DM
841 && prog->check_offset_max != SSize_t_MAX
842 && start_shift < prog->check_offset_max)
c19c836a 843 {
1a08ba3a
DM
844 SSize_t len = SvCUR(check) - !!SvTAIL(check);
845 end_point = HOP3lim(start_point,
846 prog->check_offset_max - start_shift,
847 end_point -len)
848 + len;
d6ef1678
DM
849 }
850
6bda09f9 851 DEBUG_OPTIMISE_MORE_r({
1dc475d0 852 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
1de06328 853 (int)(end_point - start_point),
fc8cd66c 854 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
1de06328
YO
855 start_point);
856 });
857
ae5d4331 858 check_at = fbm_instr( start_point, end_point,
7fba1cd6 859 check, multiline ? FBMrf_MULTILINE : 0);
1de06328 860 }
c889ccc8 861
cad2e5aa
JH
862 /* Update the count-of-usability, remove useless subpatterns,
863 unshift s. */
2c2d71f5 864
ab3bbdeb 865 DEBUG_EXECUTE_r({
f2ed9b32 866 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb 867 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1dc475d0 868 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
ae5d4331 869 (check_at ? "Found" : "Did not find"),
f2ed9b32 870 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
ab3bbdeb
YO
871 ? "anchored" : "floating"),
872 quoted,
873 RE_SV_TAIL(check),
ae5d4331 874 (check_at ? " at offset " : "...\n") );
ab3bbdeb 875 });
2c2d71f5 876
ae5d4331 877 if (!check_at)
2c2d71f5 878 goto fail_finish;
2c2d71f5 879 /* Finish the diagnostic message */
197641b6 880 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
2c2d71f5 881
fdc003fd 882 /* set rx_origin to the minimum position where the regex could start
b96863b8
DM
883 * matching, given the constraint of the just-matched check substring.
884 * But don't set it lower than previously.
fdc003fd
DM
885 */
886
b96863b8
DM
887 if (check_at - rx_origin > prog->check_offset_max)
888 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
fdc003fd
DM
889
890
1de06328
YO
891 /* XXX dmq: first branch is for positive lookbehind...
892 Our check string is offset from the beginning of the pattern.
893 So we need to do any stclass tests offset forward from that
894 point. I think. :-(
895 */
896
2c2d71f5
JH
897 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
898 Start with the other substr.
899 XXXX no SCREAM optimization yet - and a very coarse implementation
a0288114 900 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
2c2d71f5
JH
901 *always* match. Probably should be marked during compile...
902 Probably it is right to do no SCREAM here...
903 */
904
6480a6c4
DM
905 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
906 : prog->substrs->data[other_ix].substr)
1de06328 907 {
30944b6d 908 /* Take into account the "other" substring. */
6c3343a6
DM
909 char *last, *last1;
910 char *s;
911 SV* must;
912 struct reg_substr_datum *other;
913
914 do_other_substr:
915 other = &prog->substrs->data[other_ix];
916
917 /* if "other" is anchored:
918 * we've previously found a floating substr starting at check_at.
919 * This means that the regex origin must lie somewhere
920 * between min (rx_origin): HOP3(check_at, -check_offset_max)
921 * and max: HOP3(check_at, -check_offset_min)
922 * (except that min will be >= strpos)
923 * So the fixed substr must lie somewhere between
924 * HOP3(min, anchored_offset)
925 * HOP3(max, anchored_offset) + SvCUR(substr)
926 */
927
928 /* if "other" is floating
929 * Calculate last1, the absolute latest point where the
930 * floating substr could start in the string, ignoring any
931 * constraints from the earlier fixed match. It is calculated
932 * as follows:
933 *
934 * strend - prog->minlen (in chars) is the absolute latest
935 * position within the string where the origin of the regex
936 * could appear. The latest start point for the floating
937 * substr is float_min_offset(*) on from the start of the
938 * regex. last1 simply combines thee two offsets.
939 *
940 * (*) You might think the latest start point should be
941 * float_max_offset from the regex origin, and technically
942 * you'd be correct. However, consider
943 * /a\d{2,4}bcd\w/
944 * Here, float min, max are 3,5 and minlen is 7.
945 * This can match either
946 * /a\d\dbcd\w/
947 * /a\d\d\dbcd\w/
948 * /a\d\d\d\dbcd\w/
949 * In the first case, the regex matches minlen chars; in the
950 * second, minlen+1, in the third, minlen+2.
951 * In the first case, the floating offset is 3 (which equals
952 * float_min), in the second, 4, and in the third, 5 (which
953 * equals float_max). In all cases, the floating string bcd
954 * can never start more than 4 chars from the end of the
955 * string, which equals minlen - float_min. As the substring
956 * starts to match more than float_min from the start of the
957 * regex, it makes the regex match more than minlen chars,
958 * and the two cancel each other out. So we can always use
959 * float_min - minlen, rather than float_max - minlen for the
960 * latest position in the string.
961 *
962 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
963 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
964 */
965
e7a14a9c 966 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
967 last1 = HOP3c(strend,
968 other->min_offset - prog->minlen, strbeg);
969
970 if (other_ix) {
971 /* last is the latest point where the floating substr could
972 * start, *given* any constraints from the earlier fixed
973 * match. This constraint is that the floating string starts
974 * <= float_max_offset chars from the regex origin (rx_origin).
975 * If this value is less than last1, use it instead.
eb3831ce 976 */
6c3343a6
DM
977 assert(rx_origin <= last1);
978 last =
979 /* this condition handles the offset==infinity case, and
980 * is a short-cut otherwise. Although it's comparing a
981 * byte offset to a char length, it does so in a safe way,
982 * since 1 char always occupies 1 or more bytes,
983 * so if a string range is (last1 - rx_origin) bytes,
984 * it will be less than or equal to (last1 - rx_origin)
985 * chars; meaning it errs towards doing the accurate HOP3
986 * rather than just using last1 as a short-cut */
987 (last1 - rx_origin) < other->max_offset
988 ? last1
989 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
990 }
991 else {
992 assert(strpos + start_shift <= check_at);
993 last = HOP4c(check_at, other->min_offset - start_shift,
994 strbeg, strend);
995 }
ead917d0 996
6c3343a6
DM
997 s = HOP3c(rx_origin, other->min_offset, strend);
998 if (s < other_last) /* These positions already checked */
999 s = other_last;
1000
1001 must = utf8_target ? other->utf8_substr : other->substr;
1002 assert(SvPOK(must));
1003 s = fbm_instr(
1004 (unsigned char*)s,
1005 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1006 must,
1007 multiline ? FBMrf_MULTILINE : 0
1008 );
1009 DEBUG_EXECUTE_r({
1010 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1011 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1012 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1013 s ? "Found" : "Contradicts",
1014 other_ix ? "floating" : "anchored",
1015 quoted, RE_SV_TAIL(must));
1016 });
ead917d0 1017
ead917d0 1018
6c3343a6
DM
1019 if (!s) {
1020 /* last1 is latest possible substr location. If we didn't
1021 * find it before there, we never will */
1022 if (last >= last1) {
1023 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1024 ", giving up...\n"));
1025 goto fail_finish;
ead917d0
DM
1026 }
1027
6c3343a6
DM
1028 /* try to find the check substr again at a later
1029 * position. Maybe next time we'll find the "other" substr
1030 * in range too */
1031 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1032 ", trying %s at offset %ld...\n",
1033 (other_ix ? "floating" : "anchored"),
197641b6 1034 (long)(HOP3c(check_at, 1, strend) - strpos)));
6c3343a6
DM
1035
1036 other_last = HOP3c(last, 1, strend) /* highest failure */;
1037 rx_origin =
1038 other_ix
1039 ? HOP3c(rx_origin, 1, strend)
1040 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1041 goto restart;
1042 }
1043 else {
1044 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
197641b6 1045 (long)(s - strpos)));
ead917d0 1046
6c3343a6
DM
1047 if (other_ix) {
1048 /* other_last is set to s, not s+1, since its possible for
1049 * a floating substr to fail first time, then succeed
1050 * second time at the same floating position; e.g.:
1051 * "-AB--AABZ" =~ /\wAB\d*Z/
1052 * The first time round, anchored and float match at
1053 * "-(AB)--AAB(Z)" then fail on the initial \w character
1054 * class. Second time round, they match at "-AB--A(AB)(Z)".
1055 */
1056 other_last = s;
ead917d0
DM
1057 }
1058 else {
6c3343a6
DM
1059 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1060 other_last = HOP3c(s, 1, strend);
ead917d0 1061 }
6c3343a6 1062 }
cad2e5aa 1063 }
acba93e8
DM
1064 else {
1065 DEBUG_OPTIMISE_MORE_r(
1066 PerlIO_printf(Perl_debug_log,
1067 " Check-only match: offset min:%"IVdf" max:%"IVdf
1c1c599d 1068 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
acba93e8
DM
1069 " strend-strpos:%"IVdf"\n",
1070 (IV)prog->check_offset_min,
1071 (IV)prog->check_offset_max,
1c1c599d 1072 (IV)(check_at-strpos),
acba93e8 1073 (IV)(rx_origin-strpos),
1c1c599d 1074 (IV)(rx_origin-check_at),
acba93e8
DM
1075 (IV)(strend-strpos)
1076 )
1077 );
1078 }
2c2d71f5 1079
acba93e8 1080 postprocess_substr_matches:
0991020e 1081
e3c6feb0
DM
1082 /* handle the extra constraint of /^/m */
1083
a71a26b9
DM
1084 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n'
1085 /* May be due to an implicit anchor of m{.*foo} */
1086 && !(prog->intflags & PREGf_IMPLICIT))
e3c6feb0 1087 {
a62659bd
DM
1088 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1089 " looking for /^/m anchor"));
d0880ea7
DM
1090
1091 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1092 * Find the next \n, if any, even if it's beyond the current
1093 * anchored and/or floating substrings. Whether we should be
1094 * scanning ahead for the next \n or the next substr is debatable.
1095 * On the one hand you'd expect rare substrings to appear less
1096 * often than \n's. On the other hand, searching for \n means
1097 * we're effectively flipping been check_substr and "\n" on each
1098 * iteration as the current "rarest" string candidate, which
1099 * means for example that we'll quickly reject the whole string if
1100 * hasn't got a \n, rather than trying every substr position
1101 * first
1102 */
d0880ea7 1103
4ada1233 1104 rx_origin = (char *)memchr(rx_origin, '\n',
ceedd9f2 1105 HOP3c(strend, - prog->minlen, strpos) - rx_origin);
4ada1233 1106 if (!rx_origin) {
d0880ea7
DM
1107 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1108 " Did not find /%s^%s/m...\n",
1109 PL_colors[0], PL_colors[1]));
a62659bd
DM
1110 goto fail_finish;
1111 }
d0880ea7 1112
4ada1233
DM
1113 /* earliest possible origin is 1 char after the \n.
1114 * (since *rx_origin == '\n', it's safe to ++ here rather than
1115 * HOP(rx_origin, 1)) */
1116 rx_origin++;
d0880ea7 1117
f4f115de 1118 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1119 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1120 {
d0880ea7
DM
1121 /* Position contradicts check-string; either because
1122 * check was anchored (and thus has no wiggle room),
4ada1233 1123 * or check was float and rx_origin is above the float range */
d0880ea7
DM
1124 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1125 " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
4ada1233 1126 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
d0880ea7
DM
1127 goto restart;
1128 }
1129
1130 /* if we get here, the check substr must have been float,
2e759faa 1131 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1132 * "other" substr which still contradicts */
1133 assert(prog->substrs->check_ix); /* check is float */
1134
1135 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1136 /* whoops, the anchored "other" substr exists, so we still
1137 * contradict. On the other hand, the float "check" substr
1138 * didn't contradict, so just retry the anchored "other"
1139 * substr */
1140 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1141 " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1142 PL_colors[0], PL_colors[1],
1143 (long)(rx_origin - strpos),
1144 (long)(rx_origin - strpos + prog->anchored_offset)));
1145 goto do_other_substr;
1146 }
1147
1148 /* success: we don't contradict the found floating substring
1149 * (and there's no anchored substr). */
d0880ea7
DM
1150 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1151 " Found /%s^%s/m at offset %ld...\n",
1152 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
e3c6feb0
DM
1153 }
1154 else {
2e759faa
DM
1155 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1156 " Starting position does not contradict /%s^%s/m...\n",
1157 PL_colors[0], PL_colors[1]));
e3c6feb0
DM
1158 }
1159
1160
1161 /* Decide whether using the substrings helped */
1162
1163 if (rx_origin != strpos) {
1164 /* Fixed substring is found far enough so that the match
1165 cannot start at strpos. */
1166
1167 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
f2ed9b32 1168 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
cad2e5aa
JH
1169 }
1170 else {
f5952150 1171 /* The found string does not prohibit matching at strpos,
2c2d71f5 1172 - no optimization of calling REx engine can be performed,
f5952150
GS
1173 unless it was an MBOL and we are not after MBOL,
1174 or a future STCLASS check will fail this. */
2c2d71f5 1175 success_at_start:
bbe252da 1176 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
f2ed9b32 1177 && (utf8_target ? (
33b8afdf
JH
1178 prog->check_utf8 /* Could be deleted already */
1179 && --BmUSEFUL(prog->check_utf8) < 0
1180 && (prog->check_utf8 == prog->float_utf8)
1181 ) : (
1182 prog->check_substr /* Could be deleted already */
1183 && --BmUSEFUL(prog->check_substr) < 0
1184 && (prog->check_substr == prog->float_substr)
1185 )))
66e933ab 1186 {
cad2e5aa 1187 /* If flags & SOMETHING - do not do it many times on the same match */
1dc475d0 1188 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
f2ed9b32
KW
1189 /* XXX Does the destruction order has to change with utf8_target? */
1190 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1191 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
a0714e2c
SS
1192 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1193 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1194 check = NULL; /* abort */
486ec47a 1195 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
c9415951 1196 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
8e1490ee
YO
1197 if (prog->intflags & PREGf_IMPLICIT) {
1198 prog->intflags &= ~PREGf_ANCH_MBOL;
1199 /* maybe we have no anchors left after this... */
1200 if (!(prog->intflags & PREGf_ANCH))
1201 prog->extflags &= ~RXf_IS_ANCHORED;
1202 }
3cf5c195
IZ
1203 /* XXXX This is a remnant of the old implementation. It
1204 looks wasteful, since now INTUIT can use many
6eb5f6b9 1205 other heuristics. */
bbe252da 1206 prog->extflags &= ~RXf_USE_INTUIT;
c9415951 1207 /* XXXX What other flags might need to be cleared in this branch? */
cad2e5aa 1208 }
cad2e5aa
JH
1209 }
1210
6eb5f6b9
JH
1211 /* Last resort... */
1212 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1de06328
YO
1213 /* trie stclasses are too expensive to use here, we are better off to
1214 leave it to regmatch itself */
f8fc2ecf 1215 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
6eb5f6b9
JH
1216 /* minlen == 0 is possible if regstclass is \b or \B,
1217 and the fixed substr is ''$.
c765d6e0
DM
1218 Since minlen is already taken into account, rx_origin+1 is before strend;
1219 accidentally, minlen >= 1 guaranties no false positives at rx_origin + 1
6eb5f6b9
JH
1220 even for \b or \B. But (minlen? 1 : 0) below assumes that
1221 regstclass does not come from lookahead... */
1222 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
af944926 1223 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
f8fc2ecf 1224 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e
DM
1225 char *t;
1226
2c75e362 1227 /* XXX this value could be pre-computed */
f8fc2ecf 1228 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1229 ? (reginfo->is_utf8_pat
1230 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1231 : STR_LEN(progi->regstclass))
66e933ab 1232 : 1);
1de06328 1233 char * endpos;
efb9848c 1234 char *s = rx_origin;
1de06328
YO
1235 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1236 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1237 else if (prog->float_substr || prog->float_utf8)
1238 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1239 else
1240 endpos= strend;
1241
d8080198
YO
1242 if (checked_upto < s)
1243 checked_upto = s;
1dc475d0
DM
1244 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1245 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
1246 " s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1247 (IV)start_shift, (IV)(check_at - strbeg),
1248 (IV)(s - strbeg), (IV)(endpos - strbeg),
1249 (IV)(checked_upto- strbeg)));
d8080198 1250
6eb5f6b9 1251 t = s;
984e6dd1 1252 s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
f9176b44 1253 reginfo);
d8080198
YO
1254 if (s) {
1255 checked_upto = s;
1256 } else {
6eb5f6b9 1257#ifdef DEBUGGING
cbbf8932 1258 const char *what = NULL;
6eb5f6b9
JH
1259#endif
1260 if (endpos == strend) {
a3621e74 1261 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1262 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1263 goto fail;
1264 }
a3621e74 1265 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1266 " This position contradicts STCLASS...\n") );
8e1490ee 1267 if ((prog->intflags & PREGf_ANCH) && !ml_anch)
653099ff 1268 goto fail;
d8080198 1269 checked_upto = HOPBACKc(endpos, start_shift);
1dc475d0 1270 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
d8080198 1271 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
6eb5f6b9 1272 /* Contradict one of substrings */
33b8afdf 1273 if (prog->anchored_substr || prog->anchored_utf8) {
f2ed9b32 1274 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
a3621e74 1275 DEBUG_EXECUTE_r( what = "anchored" );
6eb5f6b9 1276 hop_and_restart:
1aa99e6b 1277 s = HOP3c(t, 1, strend);
66e933ab
GS
1278 if (s + start_shift + end_shift > strend) {
1279 /* XXXX Should be taken into account earlier? */
a3621e74 1280 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1281 " Could not match STCLASS...\n") );
66e933ab
GS
1282 goto fail;
1283 }
c765d6e0 1284 rx_origin = s;
5e39e1e5
HS
1285 if (!check)
1286 goto giveup;
a3621e74 1287 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1288 " Looking for %s substr starting at offset %ld...\n",
197641b6 1289 what, (long)(rx_origin + start_shift - strpos)) );
6eb5f6b9
JH
1290 goto restart;
1291 }
66e933ab 1292 /* Have both, check_string is floating */
6eb5f6b9
JH
1293 if (t + start_shift >= check_at) /* Contradicts floating=check */
1294 goto retry_floating_check;
1295 /* Recheck anchored substring, but not floating... */
c765d6e0
DM
1296 if (!check) {
1297 rx_origin = NULL;
5e39e1e5 1298 goto giveup;
c765d6e0 1299 }
a3621e74 1300 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1301 " Looking for anchored substr starting at offset %ld...\n",
197641b6 1302 (long)(other_last - strpos)) );
c347b528
DM
1303 assert(prog->substrs->check_ix); /* other is float */
1304 goto do_other_substr;
6eb5f6b9 1305 }
60e71179
GS
1306 /* Another way we could have checked stclass at the
1307 current position only: */
1308 if (ml_anch) {
3043e7b4 1309 s = rx_origin = t + 1;
5e39e1e5
HS
1310 if (!check)
1311 goto giveup;
a3621e74 1312 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1313 " Looking for /%s^%s/m starting at offset %ld...\n",
3043e7b4 1314 PL_colors[0], PL_colors[1],
197641b6 1315 (long)(rx_origin - strpos)) );
acba93e8
DM
1316 /* XXX DAPM I don't yet know why this is true, but the code
1317 * assumed it when it used to do goto try_at_offset */
1318 assert(rx_origin != strpos);
1319 goto postprocess_substr_matches;
66e933ab 1320 }
f2ed9b32 1321 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
60e71179 1322 goto fail;
486ec47a 1323 /* Check is floating substring. */
6eb5f6b9
JH
1324 retry_floating_check:
1325 t = check_at - start_shift;
a3621e74 1326 DEBUG_EXECUTE_r( what = "floating" );
6eb5f6b9
JH
1327 goto hop_and_restart;
1328 }
b7953727 1329 if (t != s) {
a3621e74 1330 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1331 " By STCLASS: moving %ld --> %ld\n",
197641b6 1332 (long)(t - strpos), (long)(s - strpos))
b7953727
JH
1333 );
1334 }
1335 else {
a3621e74 1336 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1337 " Does not contradict STCLASS...\n");
b7953727
JH
1338 );
1339 }
6eb5f6b9 1340 }
5e39e1e5 1341 giveup:
1dc475d0
DM
1342 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Intuit: %s%s:%s match at offset %ld\n",
1343 PL_colors[4], (check ? "Successfully guessed" : "Giving up"),
197641b6 1344 PL_colors[5], (long)(rx_origin - strpos)) );
c765d6e0 1345 return rx_origin;
2c2d71f5
JH
1346
1347 fail_finish: /* Substring not found */
33b8afdf 1348 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1349 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1350 fail:
a3621e74 1351 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1352 PL_colors[4], PL_colors[5]));
bd61b366 1353 return NULL;
cad2e5aa 1354}
9661b544 1355
a0a388a1 1356#define DECL_TRIE_TYPE(scan) \
098b07d5
KW
1357 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1358 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
fab2782b
YO
1359 trie_type = ((scan->flags == EXACT) \
1360 ? (utf8_target ? trie_utf8 : trie_plain) \
098b07d5
KW
1361 : (scan->flags == EXACTFA) \
1362 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1363 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
fab2782b 1364
fd3249ee 1365#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1366STMT_START { \
fab2782b 1367 STRLEN skiplen; \
baa60164 1368 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1369 switch (trie_type) { \
31f05a37 1370 case trie_utf8_exactfa_fold: \
baa60164
KW
1371 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1372 /* FALL THROUGH */ \
fab2782b
YO
1373 case trie_utf8_fold: \
1374 if ( foldlen>0 ) { \
c80e42f3 1375 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1376 foldlen -= len; \
1377 uscan += len; \
1378 len=0; \
1379 } else { \
445bf929 1380 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1381 len = UTF8SKIP(uc); \
1382 skiplen = UNISKIP( uvc ); \
1383 foldlen -= skiplen; \
1384 uscan = foldbuf + skiplen; \
1385 } \
1386 break; \
baa60164
KW
1387 case trie_latin_utf8_exactfa_fold: \
1388 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1389 /* FALL THROUGH */ \
fab2782b
YO
1390 case trie_latin_utf8_fold: \
1391 if ( foldlen>0 ) { \
c80e42f3 1392 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1393 foldlen -= len; \
1394 uscan += len; \
1395 len=0; \
1396 } else { \
1397 len = 1; \
31f05a37 1398 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1399 skiplen = UNISKIP( uvc ); \
1400 foldlen -= skiplen; \
1401 uscan = foldbuf + skiplen; \
1402 } \
1403 break; \
1404 case trie_utf8: \
c80e42f3 1405 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1406 break; \
1407 case trie_plain: \
1408 uvc = (UV)*uc; \
1409 len = 1; \
1410 } \
1411 if (uvc < 256) { \
1412 charid = trie->charmap[ uvc ]; \
1413 } \
1414 else { \
1415 charid = 0; \
1416 if (widecharmap) { \
1417 SV** const svpp = hv_fetch(widecharmap, \
1418 (char*)&uvc, sizeof(UV), 0); \
1419 if (svpp) \
1420 charid = (U16)SvIV(*svpp); \
1421 } \
1422 } \
4cadc6a9
YO
1423} STMT_END
1424
4cadc6a9
YO
1425#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1426STMT_START { \
1427 while (s <= e) { \
1428 if ( (CoNd) \
fac1af77 1429 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1430 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1431 goto got_it; \
1432 s++; \
1433 } \
1434} STMT_END
1435
1436#define REXEC_FBC_UTF8_SCAN(CoDe) \
1437STMT_START { \
9a902117 1438 while (s < strend) { \
4cadc6a9 1439 CoDe \
9a902117 1440 s += UTF8SKIP(s); \
4cadc6a9
YO
1441 } \
1442} STMT_END
1443
1444#define REXEC_FBC_SCAN(CoDe) \
1445STMT_START { \
1446 while (s < strend) { \
1447 CoDe \
1448 s++; \
1449 } \
1450} STMT_END
1451
1452#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1453REXEC_FBC_UTF8_SCAN( \
1454 if (CoNd) { \
02d5137b 1455 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1456 goto got_it; \
1457 else \
1458 tmp = doevery; \
1459 } \
1460 else \
1461 tmp = 1; \
1462)
1463
1464#define REXEC_FBC_CLASS_SCAN(CoNd) \
1465REXEC_FBC_SCAN( \
1466 if (CoNd) { \
02d5137b 1467 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1468 goto got_it; \
1469 else \
1470 tmp = doevery; \
1471 } \
1472 else \
1473 tmp = 1; \
1474)
1475
baa60164 1476#define REXEC_FBC_TRYIT \
02d5137b 1477if ((reginfo->intuit || regtry(reginfo, &s))) \
4cadc6a9
YO
1478 goto got_it
1479
e1d1eefb 1480#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
baa60164 1481 if (utf8_target) { \
e1d1eefb
YO
1482 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1483 } \
1484 else { \
1485 REXEC_FBC_CLASS_SCAN(CoNd); \
d981ef24 1486 }
e1d1eefb 1487
baa60164 1488#define DUMP_EXEC_POS(li,s,doutf8) \
9d9163fb 1489 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
6d59b646 1490 startpos, doutf8)
786e8c11 1491
cfaf538b 1492
baa60164 1493#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1494 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
cfaf538b
KW
1495 tmp = TEST_NON_UTF8(tmp); \
1496 REXEC_FBC_UTF8_SCAN( \
ce1d15d5 1497 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
cfaf538b
KW
1498 tmp = !tmp; \
1499 IF_SUCCESS; \
1500 } \
1501 else { \
1502 IF_FAIL; \
1503 } \
1504 ); \
1505
baa60164 1506#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
9d9163fb 1507 if (s == reginfo->strbeg) { \
cfaf538b
KW
1508 tmp = '\n'; \
1509 } \
1510 else { \
9d9163fb 1511 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
3db24e1e
KW
1512 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1513 0, UTF8_ALLOW_DEFAULT); \
cfaf538b
KW
1514 } \
1515 tmp = TeSt1_UtF8; \
3db24e1e 1516 LOAD_UTF8_CHARCLASS_ALNUM(); \
cfaf538b 1517 REXEC_FBC_UTF8_SCAN( \
3db24e1e 1518 if (tmp == ! (TeSt2_UtF8)) { \
cfaf538b
KW
1519 tmp = !tmp; \
1520 IF_SUCCESS; \
1521 } \
1522 else { \
1523 IF_FAIL; \
1524 } \
1525 ); \
1526
63ac0dad
KW
1527/* The only difference between the BOUND and NBOUND cases is that
1528 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1529 * NBOUND. This is accomplished by passing it in either the if or else clause,
1530 * with the other one being empty */
1531#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1532 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
cfaf538b
KW
1533
1534#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1535 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
63ac0dad
KW
1536
1537#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1538 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b
KW
1539
1540#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1dcf4a1b 1541 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
cfaf538b 1542
63ac0dad
KW
1543
1544/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1545 * be passed in completely with the variable name being tested, which isn't
1546 * such a clean interface, but this is easier to read than it was before. We
1547 * are looking for the boundary (or non-boundary between a word and non-word
1548 * character. The utf8 and non-utf8 cases have the same logic, but the details
1549 * must be different. Find the "wordness" of the character just prior to this
1550 * one, and compare it with the wordness of this one. If they differ, we have
1551 * a boundary. At the beginning of the string, pretend that the previous
1552 * character was a new-line */
baa60164 1553#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1554 if (utf8_target) { \
baa60164 1555 UTF8_CODE \
63ac0dad
KW
1556 } \
1557 else { /* Not utf8 */ \
9d9163fb 1558 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad
KW
1559 tmp = TEST_NON_UTF8(tmp); \
1560 REXEC_FBC_SCAN( \
1561 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1562 tmp = !tmp; \
1563 IF_SUCCESS; \
1564 } \
1565 else { \
1566 IF_FAIL; \
1567 } \
1568 ); \
1569 } \
baa60164 1570 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
63ac0dad
KW
1571 goto got_it;
1572
786e8c11 1573/* We know what class REx starts with. Try to find this position... */
02d5137b 1574/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1575/* annoyingly all the vars in this routine have different names from their counterparts
1576 in regmatch. /grrr */
1577
3c3eec57 1578STATIC char *
07be1b83 1579S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1580 const char *strend, regmatch_info *reginfo)
a687059c 1581{
73104a1b
KW
1582 dVAR;
1583 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1584 char *pat_string; /* The pattern's exactish string */
1585 char *pat_end; /* ptr to end char of pat_string */
1586 re_fold_t folder; /* Function for computing non-utf8 folds */
1587 const U8 *fold_array; /* array for folding ords < 256 */
1588 STRLEN ln;
1589 STRLEN lnc;
73104a1b
KW
1590 U8 c1;
1591 U8 c2;
1592 char *e;
1593 I32 tmp = 1; /* Scratch variable? */
ba44c216 1594 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1595 UV utf8_fold_flags = 0;
f9176b44 1596 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1597 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1598 with a result inverts that result, as 0^1 =
1599 1 and 1^1 = 0 */
1600 _char_class_number classnum;
1601
73104a1b 1602 RXi_GET_DECL(prog,progi);
2f7f8cb1 1603
73104a1b 1604 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1605
73104a1b
KW
1606 /* We know what class it must start with. */
1607 switch (OP(c)) {
1608 case ANYOF:
1609 if (utf8_target) {
1610 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1611 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1612 }
1613 else {
1614 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1615 }
1616 break;
1617 case CANY:
1618 REXEC_FBC_SCAN(
02d5137b 1619 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
73104a1b
KW
1620 goto got_it;
1621 else
1622 tmp = doevery;
1623 );
1624 break;
1625
098b07d5
KW
1626 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1627 assert(! is_utf8_pat);
1628 /* FALL THROUGH */
73104a1b 1629 case EXACTFA:
984e6dd1 1630 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1631 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1632 goto do_exactf_utf8;
1633 }
1634 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1635 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1636 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1637
2fdb7295
KW
1638 case EXACTF: /* This node only generated for non-utf8 patterns */
1639 assert(! is_utf8_pat);
73104a1b 1640 if (utf8_target) {
73104a1b
KW
1641 utf8_fold_flags = 0;
1642 goto do_exactf_utf8;
1643 }
1644 fold_array = PL_fold;
1645 folder = foldEQ;
1646 goto do_exactf_non_utf8;
1647
1648 case EXACTFL:
31f05a37 1649 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1650 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1651 goto do_exactf_utf8;
1652 }
1653 fold_array = PL_fold_locale;
1654 folder = foldEQ_locale;
1655 goto do_exactf_non_utf8;
3c760661 1656
73104a1b 1657 case EXACTFU_SS:
984e6dd1 1658 if (is_utf8_pat) {
73104a1b
KW
1659 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1660 }
1661 goto do_exactf_utf8;
16d951b7 1662
73104a1b 1663 case EXACTFU:
984e6dd1
DM
1664 if (is_utf8_pat || utf8_target) {
1665 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1666 goto do_exactf_utf8;
1667 }
fac1af77 1668
73104a1b
KW
1669 /* Any 'ss' in the pattern should have been replaced by regcomp,
1670 * so we don't have to worry here about this single special case
1671 * in the Latin1 range */
1672 fold_array = PL_fold_latin1;
1673 folder = foldEQ_latin1;
1674
1675 /* FALL THROUGH */
1676
1677 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1678 are no glitches with fold-length differences
1679 between the target string and pattern */
1680
1681 /* The idea in the non-utf8 EXACTF* cases is to first find the
1682 * first character of the EXACTF* node and then, if necessary,
1683 * case-insensitively compare the full text of the node. c1 is the
1684 * first character. c2 is its fold. This logic will not work for
1685 * Unicode semantics and the german sharp ss, which hence should
1686 * not be compiled into a node that gets here. */
1687 pat_string = STRING(c);
1688 ln = STR_LEN(c); /* length to match in octets/bytes */
1689
1690 /* We know that we have to match at least 'ln' bytes (which is the
1691 * same as characters, since not utf8). If we have to match 3
1692 * characters, and there are only 2 availabe, we know without
1693 * trying that it will fail; so don't start a match past the
1694 * required minimum number from the far end */
ea3daa5d 1695 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1696
02d5137b 1697 if (reginfo->intuit && e < s) {
73104a1b
KW
1698 e = s; /* Due to minlen logic of intuit() */
1699 }
fac1af77 1700
73104a1b
KW
1701 c1 = *pat_string;
1702 c2 = fold_array[c1];
1703 if (c1 == c2) { /* If char and fold are the same */
1704 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1705 }
1706 else {
1707 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1708 }
1709 break;
fac1af77 1710
73104a1b
KW
1711 do_exactf_utf8:
1712 {
1713 unsigned expansion;
1714
1715 /* If one of the operands is in utf8, we can't use the simpler folding
1716 * above, due to the fact that many different characters can have the
1717 * same fold, or portion of a fold, or different- length fold */
1718 pat_string = STRING(c);
1719 ln = STR_LEN(c); /* length to match in octets/bytes */
1720 pat_end = pat_string + ln;
984e6dd1 1721 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1722 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1723 : ln;
1724
1725 /* We have 'lnc' characters to match in the pattern, but because of
1726 * multi-character folding, each character in the target can match
1727 * up to 3 characters (Unicode guarantees it will never exceed
1728 * this) if it is utf8-encoded; and up to 2 if not (based on the
1729 * fact that the Latin 1 folds are already determined, and the
1730 * only multi-char fold in that range is the sharp-s folding to
1731 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1732 * string character. Adjust lnc accordingly, rounding up, so that
1733 * if we need to match at least 4+1/3 chars, that really is 5. */
1734 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1735 lnc = (lnc + expansion - 1) / expansion;
1736
1737 /* As in the non-UTF8 case, if we have to match 3 characters, and
1738 * only 2 are left, it's guaranteed to fail, so don't start a
1739 * match that would require us to go beyond the end of the string
1740 */
ea3daa5d 1741 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1742
02d5137b 1743 if (reginfo->intuit && e < s) {
73104a1b
KW
1744 e = s; /* Due to minlen logic of intuit() */
1745 }
0658cdde 1746
73104a1b
KW
1747 /* XXX Note that we could recalculate e to stop the loop earlier,
1748 * as the worst case expansion above will rarely be met, and as we
1749 * go along we would usually find that e moves further to the left.
1750 * This would happen only after we reached the point in the loop
1751 * where if there were no expansion we should fail. Unclear if
1752 * worth the expense */
1753
1754 while (s <= e) {
1755 char *my_strend= (char *)strend;
1756 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1757 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1758 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1759 {
1760 goto got_it;
1761 }
1762 s += (utf8_target) ? UTF8SKIP(s) : 1;
1763 }
1764 break;
1765 }
1766 case BOUNDL:
272d35c9 1767 RXp_MATCH_TAINTED_on(prog);
0eb30aeb 1768 FBC_BOUND(isWORDCHAR_LC,
f4cd282c 1769 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1770 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1771 break;
1772 case NBOUNDL:
272d35c9 1773 RXp_MATCH_TAINTED_on(prog);
0eb30aeb 1774 FBC_NBOUND(isWORDCHAR_LC,
f4cd282c 1775 isWORDCHAR_LC_uvchr(tmp),
0eb30aeb 1776 isWORDCHAR_LC_utf8((U8*)s));
73104a1b
KW
1777 break;
1778 case BOUND:
1779 FBC_BOUND(isWORDCHAR,
0eb30aeb 1780 isWORDCHAR_uni(tmp),
03940dc2 1781 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1782 break;
1783 case BOUNDA:
1784 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1785 isWORDCHAR_A(tmp),
1786 isWORDCHAR_A((U8*)s));
1787 break;
1788 case NBOUND:
1789 FBC_NBOUND(isWORDCHAR,
0eb30aeb 1790 isWORDCHAR_uni(tmp),
03940dc2 1791 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1792 break;
1793 case NBOUNDA:
1794 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1795 isWORDCHAR_A(tmp),
1796 isWORDCHAR_A((U8*)s));
1797 break;
1798 case BOUNDU:
1799 FBC_BOUND(isWORDCHAR_L1,
0eb30aeb 1800 isWORDCHAR_uni(tmp),
03940dc2 1801 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b
KW
1802 break;
1803 case NBOUNDU:
1804 FBC_NBOUND(isWORDCHAR_L1,
0eb30aeb 1805 isWORDCHAR_uni(tmp),
03940dc2 1806 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
73104a1b 1807 break;
73104a1b
KW
1808 case LNBREAK:
1809 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1810 is_LNBREAK_latin1_safe(s, strend)
1811 );
1812 break;
3018b823
KW
1813
1814 /* The argument to all the POSIX node types is the class number to pass to
1815 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1816
1817 case NPOSIXL:
1818 to_complement = 1;
1819 /* FALLTHROUGH */
1820
1821 case POSIXL:
272d35c9 1822 RXp_MATCH_TAINTED_on(prog);
3018b823
KW
1823 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1824 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 1825 break;
3018b823
KW
1826
1827 case NPOSIXD:
1828 to_complement = 1;
1829 /* FALLTHROUGH */
1830
1831 case POSIXD:
1832 if (utf8_target) {
1833 goto posix_utf8;
1834 }
1835 goto posixa;
1836
1837 case NPOSIXA:
1838 if (utf8_target) {
1839 /* The complement of something that matches only ASCII matches all
1840 * UTF-8 variant code points, plus everything in ASCII that isn't
1841 * in the class */
1842 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1843 || ! _generic_isCC_A(*s, FLAGS(c)));
1844 break;
1845 }
1846
1847 to_complement = 1;
1848 /* FALLTHROUGH */
1849
73104a1b 1850 case POSIXA:
3018b823 1851 posixa:
73104a1b 1852 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
1853 * byte invariant character. */
1854 REXEC_FBC_CLASS_SCAN(
1855 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 1856 break;
3018b823
KW
1857
1858 case NPOSIXU:
1859 to_complement = 1;
1860 /* FALLTHROUGH */
1861
1862 case POSIXU:
1863 if (! utf8_target) {
1864 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1865 FLAGS(c))));
1866 }
1867 else {
1868
1869 posix_utf8:
1870 classnum = (_char_class_number) FLAGS(c);
1871 if (classnum < _FIRST_NON_SWASH_CC) {
1872 while (s < strend) {
1873
1874 /* We avoid loading in the swash as long as possible, but
1875 * should we have to, we jump to a separate loop. This
1876 * extra 'if' statement is what keeps this code from being
1877 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1878 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1879 goto found_above_latin1;
1880 }
1881 if ((UTF8_IS_INVARIANT(*s)
1882 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1883 classnum)))
1884 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1885 && to_complement ^ cBOOL(
94bb8c36
KW
1886 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1887 *(s + 1)),
3018b823
KW
1888 classnum))))
1889 {
02d5137b 1890 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
1891 goto got_it;
1892 else {
1893 tmp = doevery;
1894 }
1895 }
1896 else {
1897 tmp = 1;
1898 }
1899 s += UTF8SKIP(s);
1900 }
1901 }
1902 else switch (classnum) { /* These classes are implemented as
1903 macros */
1904 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1905 revert the change of \v matching this */
1906 /* FALL THROUGH */
1907
1908 case _CC_ENUM_PSXSPC:
1909 REXEC_FBC_UTF8_CLASS_SCAN(
1910 to_complement ^ cBOOL(isSPACE_utf8(s)));
1911 break;
1912
1913 case _CC_ENUM_BLANK:
1914 REXEC_FBC_UTF8_CLASS_SCAN(
1915 to_complement ^ cBOOL(isBLANK_utf8(s)));
1916 break;
1917
1918 case _CC_ENUM_XDIGIT:
1919 REXEC_FBC_UTF8_CLASS_SCAN(
1920 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1921 break;
1922
1923 case _CC_ENUM_VERTSPACE:
1924 REXEC_FBC_UTF8_CLASS_SCAN(
1925 to_complement ^ cBOOL(isVERTWS_utf8(s)));
1926 break;
1927
1928 case _CC_ENUM_CNTRL:
1929 REXEC_FBC_UTF8_CLASS_SCAN(
1930 to_complement ^ cBOOL(isCNTRL_utf8(s)));
1931 break;
1932
1933 default:
1934 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1935 assert(0); /* NOTREACHED */
1936 }
1937 }
1938 break;
1939
1940 found_above_latin1: /* Here we have to load a swash to get the result
1941 for the current code point */
1942 if (! PL_utf8_swash_ptrs[classnum]) {
1943 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1944 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
1945 _core_swash_init("utf8",
1946 "",
1947 &PL_sv_undef, 1, 0,
1948 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
1949 }
1950
1951 /* This is a copy of the loop above for swash classes, though using the
1952 * FBC macro instead of being expanded out. Since we've loaded the
1953 * swash, we don't have to check for that each time through the loop */
1954 REXEC_FBC_UTF8_CLASS_SCAN(
1955 to_complement ^ cBOOL(_generic_utf8(
1956 classnum,
1957 s,
1958 swash_fetch(PL_utf8_swash_ptrs[classnum],
1959 (U8 *) s, TRUE))));
73104a1b
KW
1960 break;
1961
1962 case AHOCORASICKC:
1963 case AHOCORASICK:
1964 {
1965 DECL_TRIE_TYPE(c);
1966 /* what trie are we using right now */
1967 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1968 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1969 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1970
1971 const char *last_start = strend - trie->minlen;
6148ee25 1972#ifdef DEBUGGING
73104a1b 1973 const char *real_start = s;
6148ee25 1974#endif
73104a1b
KW
1975 STRLEN maxlen = trie->maxlen;
1976 SV *sv_points;
1977 U8 **points; /* map of where we were in the input string
1978 when reading a given char. For ASCII this
1979 is unnecessary overhead as the relationship
1980 is always 1:1, but for Unicode, especially
1981 case folded Unicode this is not true. */
1982 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1983 U8 *bitmap=NULL;
1984
1985
1986 GET_RE_DEBUG_FLAGS_DECL;
1987
1988 /* We can't just allocate points here. We need to wrap it in
1989 * an SV so it gets freed properly if there is a croak while
1990 * running the match */
1991 ENTER;
1992 SAVETMPS;
1993 sv_points=newSV(maxlen * sizeof(U8 *));
1994 SvCUR_set(sv_points,
1995 maxlen * sizeof(U8 *));
1996 SvPOK_on(sv_points);
1997 sv_2mortal(sv_points);
1998 points=(U8**)SvPV_nolen(sv_points );
1999 if ( trie_type != trie_utf8_fold
2000 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2001 {
2002 if (trie->bitmap)
2003 bitmap=(U8*)trie->bitmap;
2004 else
2005 bitmap=(U8*)ANYOF_BITMAP(c);
2006 }
2007 /* this is the Aho-Corasick algorithm modified a touch
2008 to include special handling for long "unknown char" sequences.
2009 The basic idea being that we use AC as long as we are dealing
2010 with a possible matching char, when we encounter an unknown char
2011 (and we have not encountered an accepting state) we scan forward
2012 until we find a legal starting char.
2013 AC matching is basically that of trie matching, except that when
2014 we encounter a failing transition, we fall back to the current
2015 states "fail state", and try the current char again, a process
2016 we repeat until we reach the root state, state 1, or a legal
2017 transition. If we fail on the root state then we can either
2018 terminate if we have reached an accepting state previously, or
2019 restart the entire process from the beginning if we have not.
2020
2021 */
2022 while (s <= last_start) {
2023 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2024 U8 *uc = (U8*)s;
2025 U16 charid = 0;
2026 U32 base = 1;
2027 U32 state = 1;
2028 UV uvc = 0;
2029 STRLEN len = 0;
2030 STRLEN foldlen = 0;
2031 U8 *uscan = (U8*)NULL;
2032 U8 *leftmost = NULL;
2033#ifdef DEBUGGING
2034 U32 accepted_word= 0;
786e8c11 2035#endif
73104a1b
KW
2036 U32 pointpos = 0;
2037
2038 while ( state && uc <= (U8*)strend ) {
2039 int failed=0;
2040 U32 word = aho->states[ state ].wordnum;
2041
2042 if( state==1 ) {
2043 if ( bitmap ) {
2044 DEBUG_TRIE_EXECUTE_r(
2045 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2046 dump_exec_pos( (char *)uc, c, strend, real_start,
2047 (char *)uc, utf8_target );
2048 PerlIO_printf( Perl_debug_log,
2049 " Scanning for legal start char...\n");
2050 }
2051 );
2052 if (utf8_target) {
2053 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2054 uc += UTF8SKIP(uc);
2055 }
2056 } else {
2057 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2058 uc++;
2059 }
786e8c11 2060 }
73104a1b 2061 s= (char *)uc;
07be1b83 2062 }
73104a1b
KW
2063 if (uc >(U8*)last_start) break;
2064 }
2065
2066 if ( word ) {
2067 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2068 if (!leftmost || lpos < leftmost) {
2069 DEBUG_r(accepted_word=word);
2070 leftmost= lpos;
7016d6eb 2071 }
73104a1b 2072 if (base==0) break;
7016d6eb 2073
73104a1b
KW
2074 }
2075 points[pointpos++ % maxlen]= uc;
2076 if (foldlen || uc < (U8*)strend) {
2077 REXEC_TRIE_READ_CHAR(trie_type, trie,
2078 widecharmap, uc,
2079 uscan, len, uvc, charid, foldlen,
2080 foldbuf, uniflags);
2081 DEBUG_TRIE_EXECUTE_r({
2082 dump_exec_pos( (char *)uc, c, strend,
2083 real_start, s, utf8_target);
2084 PerlIO_printf(Perl_debug_log,
2085 " Charid:%3u CP:%4"UVxf" ",
2086 charid, uvc);
2087 });
2088 }
2089 else {
2090 len = 0;
2091 charid = 0;
2092 }
07be1b83 2093
73104a1b
KW
2094
2095 do {
6148ee25 2096#ifdef DEBUGGING
73104a1b 2097 word = aho->states[ state ].wordnum;
6148ee25 2098#endif
73104a1b
KW
2099 base = aho->states[ state ].trans.base;
2100
2101 DEBUG_TRIE_EXECUTE_r({
2102 if (failed)
2103 dump_exec_pos( (char *)uc, c, strend, real_start,
2104 s, utf8_target );
2105 PerlIO_printf( Perl_debug_log,
2106 "%sState: %4"UVxf", word=%"UVxf,
2107 failed ? " Fail transition to " : "",
2108 (UV)state, (UV)word);
2109 });
2110 if ( base ) {
2111 U32 tmp;
2112 I32 offset;
2113 if (charid &&
2114 ( ((offset = base + charid
2115 - 1 - trie->uniquecharcount)) >= 0)
2116 && ((U32)offset < trie->lasttrans)
2117 && trie->trans[offset].check == state
2118 && (tmp=trie->trans[offset].next))
2119 {
2120 DEBUG_TRIE_EXECUTE_r(
2121 PerlIO_printf( Perl_debug_log," - legal\n"));
2122 state = tmp;
2123 break;
07be1b83
YO
2124 }
2125 else {
786e8c11 2126 DEBUG_TRIE_EXECUTE_r(
73104a1b 2127 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2128 failed = 1;
73104a1b 2129 state = aho->fail[state];
07be1b83 2130 }
07be1b83 2131 }
73104a1b
KW
2132 else {
2133 /* we must be accepting here */
2134 DEBUG_TRIE_EXECUTE_r(
2135 PerlIO_printf( Perl_debug_log," - accepting\n"));
2136 failed = 1;
2137 break;
786e8c11 2138 }
73104a1b
KW
2139 } while(state);
2140 uc += len;
2141 if (failed) {
2142 if (leftmost)
2143 break;
2144 if (!state) state = 1;
07be1b83 2145 }
73104a1b
KW
2146 }
2147 if ( aho->states[ state ].wordnum ) {
2148 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2149 if (!leftmost || lpos < leftmost) {
2150 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2151 leftmost = lpos;
07be1b83
YO
2152 }
2153 }
73104a1b
KW
2154 if (leftmost) {
2155 s = (char*)leftmost;
2156 DEBUG_TRIE_EXECUTE_r({
2157 PerlIO_printf(
2158 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2159 (UV)accepted_word, (IV)(s - real_start)
2160 );
2161 });
02d5137b 2162 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2163 FREETMPS;
2164 LEAVE;
2165 goto got_it;
2166 }
2167 s = HOPc(s,1);
2168 DEBUG_TRIE_EXECUTE_r({
2169 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2170 });
2171 } else {
2172 DEBUG_TRIE_EXECUTE_r(
2173 PerlIO_printf( Perl_debug_log,"No match.\n"));
2174 break;
2175 }
2176 }
2177 FREETMPS;
2178 LEAVE;
2179 }
2180 break;
2181 default:
2182 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2183 break;
2184 }
2185 return 0;
2186 got_it:
2187 return s;
6eb5f6b9
JH
2188}
2189
60165aa4
DM
2190/* set RX_SAVED_COPY, RX_SUBBEG etc.
2191 * flags have same meanings as with regexec_flags() */
2192
749f4950
DM
2193static void
2194S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2195 char *strbeg,
2196 char *strend,
2197 SV *sv,
2198 U32 flags,
2199 bool utf8_target)
2200{
2201 struct regexp *const prog = ReANY(rx);
2202
60165aa4
DM
2203 if (flags & REXEC_COPY_STR) {
2204#ifdef PERL_ANY_COW
2205 if (SvCANCOW(sv)) {
2206 if (DEBUG_C_TEST) {
2207 PerlIO_printf(Perl_debug_log,
2208 "Copy on write: regexp capture, type %d\n",
2209 (int) SvTYPE(sv));
2210 }
5411a0e5
DM
2211 /* Create a new COW SV to share the match string and store
2212 * in saved_copy, unless the current COW SV in saved_copy
2213 * is valid and suitable for our purpose */
2214 if (( prog->saved_copy
2215 && SvIsCOW(prog->saved_copy)
2216 && SvPOKp(prog->saved_copy)
2217 && SvIsCOW(sv)
2218 && SvPOKp(sv)
2219 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2220 {
5411a0e5
DM
2221 /* just reuse saved_copy SV */
2222 if (RXp_MATCH_COPIED(prog)) {
2223 Safefree(prog->subbeg);
2224 RXp_MATCH_COPIED_off(prog);
2225 }
2226 }
2227 else {
2228 /* create new COW SV to share string */
a76b0e90
DM
2229 RX_MATCH_COPY_FREE(rx);
2230 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2231 }
5411a0e5
DM
2232 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2233 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2234 prog->sublen = strend - strbeg;
2235 prog->suboffset = 0;
2236 prog->subcoffset = 0;
2237 } else
2238#endif
2239 {
99a90e59
FC
2240 SSize_t min = 0;
2241 SSize_t max = strend - strbeg;
ea3daa5d 2242 SSize_t sublen;
60165aa4
DM
2243
2244 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2245 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2246 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2247 ) { /* don't copy $' part of string */
2248 U32 n = 0;
2249 max = -1;
2250 /* calculate the right-most part of the string covered
2251 * by a capture. Due to look-ahead, this may be to
2252 * the right of $&, so we have to scan all captures */
2253 while (n <= prog->lastparen) {
2254 if (prog->offs[n].end > max)
2255 max = prog->offs[n].end;
2256 n++;
2257 }
2258 if (max == -1)
2259 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2260 ? prog->offs[0].start
2261 : 0;
2262 assert(max >= 0 && max <= strend - strbeg);
2263 }
2264
2265 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2266 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2267 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2268 ) { /* don't copy $` part of string */
2269 U32 n = 0;
2270 min = max;
2271 /* calculate the left-most part of the string covered
2272 * by a capture. Due to look-behind, this may be to
2273 * the left of $&, so we have to scan all captures */
2274 while (min && n <= prog->lastparen) {
2275 if ( prog->offs[n].start != -1
2276 && prog->offs[n].start < min)
2277 {
2278 min = prog->offs[n].start;
2279 }
2280 n++;
2281 }
2282 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2283 && min > prog->offs[0].end
2284 )
2285 min = prog->offs[0].end;
2286
2287 }
2288
2289 assert(min >= 0 && min <= max && min <= strend - strbeg);
2290 sublen = max - min;
2291
2292 if (RX_MATCH_COPIED(rx)) {
2293 if (sublen > prog->sublen)
2294 prog->subbeg =
2295 (char*)saferealloc(prog->subbeg, sublen+1);
2296 }
2297 else
2298 prog->subbeg = (char*)safemalloc(sublen+1);
2299 Copy(strbeg + min, prog->subbeg, sublen, char);
2300 prog->subbeg[sublen] = '\0';
2301 prog->suboffset = min;
2302 prog->sublen = sublen;
2303 RX_MATCH_COPIED_on(rx);
2304 }
2305 prog->subcoffset = prog->suboffset;
2306 if (prog->suboffset && utf8_target) {
2307 /* Convert byte offset to chars.
2308 * XXX ideally should only compute this if @-/@+
2309 * has been seen, a la PL_sawampersand ??? */
2310
2311 /* If there's a direct correspondence between the
2312 * string which we're matching and the original SV,
2313 * then we can use the utf8 len cache associated with
2314 * the SV. In particular, it means that under //g,
2315 * sv_pos_b2u() will use the previously cached
2316 * position to speed up working out the new length of
2317 * subcoffset, rather than counting from the start of
2318 * the string each time. This stops
2319 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2320 * from going quadratic */
2321 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2322 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2323 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2324 else
2325 prog->subcoffset = utf8_length((U8*)strbeg,
2326 (U8*)(strbeg+prog->suboffset));
2327 }
2328 }
2329 else {
2330 RX_MATCH_COPY_FREE(rx);
2331 prog->subbeg = strbeg;
2332 prog->suboffset = 0;
2333 prog->subcoffset = 0;
2334 prog->sublen = strend - strbeg;
2335 }
2336}
2337
2338
2339
fae667d5 2340
6eb5f6b9
JH
2341/*
2342 - regexec_flags - match a regexp against a string
2343 */
2344I32
5aaab254 2345Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2346 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2347/* stringarg: the point in the string at which to begin matching */
2348/* strend: pointer to null at end of string */
2349/* strbeg: real beginning of string */
2350/* minend: end of match must be >= minend bytes after stringarg. */
2351/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2352 * itself is accessed via the pointers above */
2353/* data: May be used for some additional optimizations.
d058ec57 2354 Currently unused. */
a340edde 2355/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2356
6eb5f6b9 2357{
97aff369 2358 dVAR;
8d919b0a 2359 struct regexp *const prog = ReANY(rx);
5aaab254 2360 char *s;
eb578fdb 2361 regnode *c;
03c83e26 2362 char *startpos;
ea3daa5d
FC
2363 SSize_t minlen; /* must match at least this many chars */
2364 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2365 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2366 I32 multiline;
f8fc2ecf 2367 RXi_GET_DECL(prog,progi);
02d5137b
DM
2368 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2369 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2370 regexp_paren_pair *swap = NULL;
006f26b2 2371 I32 oldsave;
a3621e74
YO
2372 GET_RE_DEBUG_FLAGS_DECL;
2373
7918f24d 2374 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2375 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2376
2377 /* Be paranoid... */
03c83e26 2378 if (prog == NULL || stringarg == NULL) {
6eb5f6b9
JH
2379 Perl_croak(aTHX_ "NULL regexp parameter");
2380 return 0;
2381 }
2382
6c3fea77 2383 DEBUG_EXECUTE_r(
03c83e26 2384 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2385 "Matching");
2386 );
8adc0f72 2387
b342a604
DM
2388 startpos = stringarg;
2389
58430ea8 2390 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2391 MAGIC *mg;
2392
fef7148b
DM
2393 /* set reginfo->ganch, the position where \G can match */
2394
2395 reginfo->ganch =
2396 (flags & REXEC_IGNOREPOS)
2397 ? stringarg /* use start pos rather than pos() */
2398 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2399 /* Defined pos(): */
2400 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2401 : strbeg; /* pos() not defined; use start of string */
2402
2403 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2404 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2405
03c83e26
DM
2406 /* in the presence of \G, we may need to start looking earlier in
2407 * the string than the suggested start point of stringarg:
0b2c2a84 2408 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2409 * offset, such as
2410 * /..\G/: gofs = 2
2411 * /ab|c\G/: gofs = 1
2412 * or if the minimum offset isn't known, then we have to go back
2413 * to the start of the string, e.g. /w+\G/
2414 */
2bfbe302 2415
8e1490ee 2416 if (prog->intflags & PREGf_ANCH_GPOS) {
2bfbe302
DM
2417 startpos = reginfo->ganch - prog->gofs;
2418 if (startpos <
2419 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2420 {
2421 DEBUG_r(PerlIO_printf(Perl_debug_log,
2422 "fail: ganch-gofs before earliest possible start\n"));
2423 return 0;
2424 }
2425 }
2426 else if (prog->gofs) {
b342a604
DM
2427 if (startpos - prog->gofs < strbeg)
2428 startpos = strbeg;
2429 else
2430 startpos -= prog->gofs;
03c83e26 2431 }
58430ea8 2432 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2433 startpos = strbeg;
03c83e26
DM
2434 }
2435
2436 minlen = prog->minlen;
b342a604 2437 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2438 DEBUG_r(PerlIO_printf(Perl_debug_log,
2439 "Regex match can't succeed, so not even tried\n"));
2440 return 0;
2441 }
2442
63a3746a
DM
2443 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2444 * which will call destuctors to reset PL_regmatch_state, free higher
2445 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2446 * regmatch_info_aux_eval */
2447
2448 oldsave = PL_savestack_ix;
2449
dfa77d06
DM
2450 s = startpos;
2451
e322109a 2452 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2453 && !(flags & REXEC_CHECKED))
2454 {
dfa77d06 2455 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2456 flags, NULL);
dfa77d06 2457 if (!s)
7fadf4a7
DM
2458 return 0;
2459
e322109a 2460 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2461 /* we can match based purely on the result of INTUIT.
2462 * Set up captures etc just for $& and $-[0]
2463 * (an intuit-only match wont have $1,$2,..) */
2464 assert(!prog->nparens);
d5e7783a
DM
2465
2466 /* s/// doesn't like it if $& is earlier than where we asked it to
2467 * start searching (which can happen on something like /.\G/) */
2468 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2469 && (s < stringarg))
2470 {
2471 /* this should only be possible under \G */
58430ea8 2472 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2473 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2474 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2475 goto phooey;
2476 }
2477
7fadf4a7
DM
2478 /* match via INTUIT shouldn't have any captures.
2479 * Let @-, @+, $^N know */
2480 prog->lastparen = prog->lastcloseparen = 0;
2481 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2482 prog->offs[0].start = s - strbeg;
2483 prog->offs[0].end = utf8_target
2484 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2485 : s - strbeg + prog->minlenret;
7fadf4a7 2486 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2487 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2488 strbeg, strend,
2489 sv, flags, utf8_target);
2490
7fadf4a7
DM
2491 return 1;
2492 }
2493 }
2494
6c3fea77 2495 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2496
dfa77d06 2497 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2498 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2499 "String too short [regexec_flags]...\n"));
2500 goto phooey;
1aa99e6b 2501 }
1de06328 2502
6eb5f6b9 2503 /* Check validity of program. */
f8fc2ecf 2504 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2505 Perl_croak(aTHX_ "corrupted regexp program");
2506 }
2507
272d35c9 2508 RX_MATCH_TAINTED_off(rx);
6eb5f6b9 2509
6c3fea77
DM
2510 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2511 reginfo->intuit = 0;
2512 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2513 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2514 reginfo->warned = FALSE;
9d9163fb 2515 reginfo->strbeg = strbeg;
02d5137b 2516 reginfo->sv = sv;
1cb48e53 2517 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2518 reginfo->strend = strend;
6eb5f6b9 2519 /* see how far we have to get to not match where we matched before */
fe3974be 2520 reginfo->till = stringarg + minend;
6eb5f6b9 2521
82c23608
FC
2522 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) {
2523 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2524 S_cleanup_regmatch_info_aux has executed (registered by
2525 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2526 magic belonging to this SV.
2527 Not newSVsv, either, as it does not COW.
2528 */
2529 reginfo->sv = newSV(0);
4cba5ac0 2530 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2531 SAVEFREESV(reginfo->sv);
2532 }
2533
331b2dcc
DM
2534 /* reserve next 2 or 3 slots in PL_regmatch_state:
2535 * slot N+0: may currently be in use: skip it
2536 * slot N+1: use for regmatch_info_aux struct
2537 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2538 * slot N+3: ready for use by regmatch()
2539 */
bf2039a9 2540
331b2dcc
DM
2541 {
2542 regmatch_state *old_regmatch_state;
2543 regmatch_slab *old_regmatch_slab;
2544 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2545
2546 /* on first ever match, allocate first slab */
2547 if (!PL_regmatch_slab) {
2548 Newx(PL_regmatch_slab, 1, regmatch_slab);
2549 PL_regmatch_slab->prev = NULL;
2550 PL_regmatch_slab->next = NULL;
2551 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2552 }
bf2039a9 2553
331b2dcc
DM
2554 old_regmatch_state = PL_regmatch_state;
2555 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2556
331b2dcc
DM
2557 for (i=0; i <= max; i++) {
2558 if (i == 1)
2559 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2560 else if (i ==2)
2561 reginfo->info_aux_eval =
2562 reginfo->info_aux->info_aux_eval =
2563 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 2564
331b2dcc
DM
2565 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2566 PL_regmatch_state = S_push_slab(aTHX);
2567 }
bf2039a9 2568
331b2dcc
DM
2569 /* note initial PL_regmatch_state position; at end of match we'll
2570 * pop back to there and free any higher slabs */
bf2039a9 2571
331b2dcc
DM
2572 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2573 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 2574 reginfo->info_aux->poscache = NULL;
bf2039a9 2575
331b2dcc 2576 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 2577
331b2dcc
DM
2578 if ((prog->extflags & RXf_EVAL_SEEN))
2579 S_setup_eval_state(aTHX_ reginfo);
2580 else
2581 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 2582 }
d3aa529c 2583
6eb5f6b9 2584 /* If there is a "must appear" string, look for it. */
6eb5f6b9 2585
288b8c02 2586 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
2587 /* We have to be careful. If the previous successful match
2588 was from this regex we don't want a subsequent partially
2589 successful match to clobber the old results.
2590 So when we detect this possibility we add a swap buffer
d8da0584
KW
2591 to the re, and switch the buffer each match. If we fail,
2592 we switch it back; otherwise we leave it swapped.
e9105d30
GG
2593 */
2594 swap = prog->offs;
2595 /* do we need a save destructor here for eval dies? */
2596 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
2597 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2598 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2599 PTR2UV(prog),
2600 PTR2UV(swap),
2601 PTR2UV(prog->offs)
2602 ));
c74340f9 2603 }
6eb5f6b9
JH
2604
2605 /* Simplest case: anchored match need be tried only once. */
2606 /* [unless only anchor is BOL and multiline is set] */
8e1490ee 2607 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3542935d 2608 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9 2609 goto got_it;
8e1490ee 2610 else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
6eb5f6b9
JH
2611 {
2612 char *end;
2613
2614 if (minlen)
2615 dontbother = minlen - 1;
1aa99e6b 2616 end = HOP3c(strend, -dontbother, strbeg) - 1;
6eb5f6b9 2617 /* for multiline we only have to try after newlines */
33b8afdf 2618 if (prog->check_substr || prog->check_utf8) {
92f3d482
YO
2619 /* because of the goto we can not easily reuse the macros for bifurcating the
2620 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2621 if (utf8_target) {
2622 if (s == startpos)
2623 goto after_try_utf8;
2624 while (1) {
02d5137b 2625 if (regtry(reginfo, &s)) {
92f3d482
YO
2626 goto got_it;
2627 }
2628 after_try_utf8:
2629 if (s > end) {
2630 goto phooey;
2631 }
2632 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2633 s = re_intuit_start(rx, sv, strbeg,
2634 s + UTF8SKIP(s), strend, flags, NULL);
92f3d482
YO
2635 if (!s) {
2636 goto phooey;
2637 }
2638 }
2639 else {
2640 s += UTF8SKIP(s);
2641 }
2642 }
2643 } /* end search for check string in unicode */
2644 else {
2645 if (s == startpos) {
2646 goto after_try_latin;
2647 }
2648 while (1) {
02d5137b 2649 if (regtry(reginfo, &s)) {
92f3d482
YO
2650 goto got_it;
2651 }
2652 after_try_latin:
2653 if (s > end) {
2654 goto phooey;
2655 }
2656 if (prog->extflags & RXf_USE_INTUIT) {
52a21eb3
DM
2657 s = re_intuit_start(rx, sv, strbeg,
2658 s + 1, strend, flags, NULL);
92f3d482
YO
2659 if (!s) {
2660 goto phooey;
2661 }
2662 }
2663 else {
2664 s++;
2665 }
2666 }
2667 } /* end search for check string in latin*/
2668 } /* end search for check string */
2669 else { /* search for newline */
2670 if (s > startpos) {
2671 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
6eb5f6b9 2672 s--;
92f3d482 2673 }
21eede78
YO
2674 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2675 while (s <= end) { /* note it could be possible to match at the end of the string */
6eb5f6b9 2676 if (*s++ == '\n') { /* don't need PL_utf8skip here */
02d5137b 2677 if (regtry(reginfo, &s))
6eb5f6b9
JH
2678 goto got_it;
2679 }
92f3d482
YO
2680 }
2681 } /* end search for newline */
2682 } /* end anchored/multiline check string search */
6eb5f6b9 2683 goto phooey;
a8430a8b 2684 } else if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 2685 {
a8430a8b
YO
2686 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2687 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
2688 /* For anchored \G, the only position it can match from is
2689 * (ganch-gofs); we already set startpos to this above; if intuit
2690 * moved us on from there, we can't possibly succeed */
2691 assert(startpos == reginfo->ganch - prog->gofs);
2692 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
2693 goto got_it;
2694 goto phooey;
2695 }
2696
2697 /* Messy cases: unanchored match. */
bbe252da 2698 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 2699 /* we have /x+whatever/ */
984e6dd1 2700 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 2701 char ch;
bf93d4cc
GS
2702#ifdef DEBUGGING
2703 int did_match = 0;
2704#endif
f2ed9b32 2705 if (utf8_target) {
7e0d5ad7
KW
2706 if (! prog->anchored_utf8) {
2707 to_utf8_substr(prog);
2708 }
2709 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 2710 REXEC_FBC_SCAN(
6eb5f6b9 2711 if (*s == ch) {
a3621e74 2712 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2713 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2714 s += UTF8SKIP(s);
2715 while (s < strend && *s == ch)
2716 s += UTF8SKIP(s);
2717 }
4cadc6a9 2718 );
7e0d5ad7 2719
6eb5f6b9
JH
2720 }
2721 else {
7e0d5ad7
KW
2722 if (! prog->anchored_substr) {
2723 if (! to_byte_substr(prog)) {
6b54ddc5 2724 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2725 }
2726 }
2727 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 2728 REXEC_FBC_SCAN(
6eb5f6b9 2729 if (*s == ch) {
a3621e74 2730 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 2731 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
2732 s++;
2733 while (s < strend && *s == ch)
2734 s++;
2735 }
4cadc6a9 2736 );
6eb5f6b9 2737 }
a3621e74 2738 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 2739 PerlIO_printf(Perl_debug_log,
b7953727
JH
2740 "Did not find anchored character...\n")
2741 );
6eb5f6b9 2742 }
a0714e2c
SS
2743 else if (prog->anchored_substr != NULL
2744 || prog->anchored_utf8 != NULL
2745 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
2746 && prog->float_max_offset < strend - s)) {
2747 SV *must;
ea3daa5d
FC
2748 SSize_t back_max;
2749 SSize_t back_min;
33b8afdf 2750 char *last;
6eb5f6b9 2751 char *last1; /* Last position checked before */
bf93d4cc
GS
2752#ifdef DEBUGGING
2753 int did_match = 0;
2754#endif
33b8afdf 2755 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
2756 if (utf8_target) {
2757 if (! prog->anchored_utf8) {
2758 to_utf8_substr(prog);
2759 }
2760 must = prog->anchored_utf8;
2761 }
2762 else {
2763 if (! prog->anchored_substr) {
2764 if (! to_byte_substr(prog)) {
6b54ddc5 2765 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2766 }
2767 }
2768 must = prog->anchored_substr;
2769 }
33b8afdf
JH
2770 back_max = back_min = prog->anchored_offset;
2771 } else {
7e0d5ad7
KW
2772 if (utf8_target) {
2773 if (! prog->float_utf8) {
2774 to_utf8_substr(prog);
2775 }
2776 must = prog->float_utf8;
2777 }
2778 else {
2779 if (! prog->float_substr) {
2780 if (! to_byte_substr(prog)) {
6b54ddc5 2781 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2782 }
2783 }
2784 must = prog->float_substr;
2785 }
33b8afdf
JH
2786 back_max = prog->float_max_offset;
2787 back_min = prog->float_min_offset;
2788 }
1de06328 2789
1de06328
YO
2790 if (back_min<0) {
2791 last = strend;
2792 } else {
2793 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 2794 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
2795 - (SvTAIL(must) != 0) + back_min), strbeg);
2796 }
9d9163fb 2797 if (s > reginfo->strbeg)
6eb5f6b9
JH
2798 last1 = HOPc(s, -1);
2799 else
2800 last1 = s - 1; /* bogus */
2801
a0288114 2802 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 2803 check_substr==must. */
bf05793b 2804 dontbother = 0;
6eb5f6b9
JH
2805 strend = HOPc(strend, -dontbother);
2806 while ( (s <= last) &&
e50d57d4 2807 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 2808 (unsigned char*)strend, must,
c33e64f0 2809 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 2810 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
2811 if (HOPc(s, -back_max) > last1) {
2812 last1 = HOPc(s, -back_min);
2813 s = HOPc(s, -back_max);
2814 }
2815 else {
9d9163fb
DM
2816 char * const t = (last1 >= reginfo->strbeg)
2817 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
2818
2819 last1 = HOPc(s, -back_min);
52657f30 2820 s = t;
6eb5f6b9 2821 }
f2ed9b32 2822 if (utf8_target) {
6eb5f6b9 2823 while (s <= last1) {
02d5137b 2824 if (regtry(reginfo, &s))
6eb5f6b9 2825 goto got_it;
7016d6eb
DM
2826 if (s >= last1) {
2827 s++; /* to break out of outer loop */
2828 break;
2829 }
2830 s += UTF8SKIP(s);
6eb5f6b9
JH
2831 }
2832 }
2833 else {
2834 while (s <= last1) {
02d5137b 2835 if (regtry(reginfo, &s))
6eb5f6b9
JH
2836 goto got_it;
2837 s++;
2838 }
2839 }
2840 }
ab3bbdeb 2841 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 2842 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
2843 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2844 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 2845 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 2846 ? "anchored" : "floating"),
ab3bbdeb
YO
2847 quoted, RE_SV_TAIL(must));
2848 });
6eb5f6b9
JH
2849 goto phooey;
2850 }
f8fc2ecf 2851 else if ( (c = progi->regstclass) ) {
f14c76ed 2852 if (minlen) {
f8fc2ecf 2853 const OPCODE op = OP(progi->regstclass);
66e933ab 2854 /* don't bother with what can't match */
786e8c11 2855 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
f14c76ed
RGS
2856 strend = HOPc(strend, -(minlen - 1));
2857 }
a3621e74 2858 DEBUG_EXECUTE_r({
be8e71aa 2859 SV * const prop = sv_newmortal();
32fc9b6a 2860 regprop(prog, prop, c);
0df25f3d 2861 {
f2ed9b32 2862 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 2863 s,strend-s,60);
0df25f3d 2864 PerlIO_printf(Perl_debug_log,
1c8f8eb1 2865 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 2866 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 2867 quoted, (int)(strend - s));
0df25f3d 2868 }
ffc61ed2 2869 });
f9176b44 2870 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 2871 goto got_it;
07be1b83 2872 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
2873 }
2874 else {
2875 dontbother = 0;
a0714e2c 2876 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 2877 /* Trim the end. */
6af40bd7 2878 char *last= NULL;
33b8afdf 2879 SV* float_real;
c33e64f0
FC
2880 STRLEN len;
2881 const char *little;
33b8afdf 2882
7e0d5ad7
KW
2883 if (utf8_target) {
2884 if (! prog->float_utf8) {
2885 to_utf8_substr(prog);
2886 }
2887 float_real = prog->float_utf8;
2888 }
2889 else {
2890 if (! prog->float_substr) {
2891 if (! to_byte_substr(prog)) {
6b54ddc5 2892 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
2893 }
2894 }
2895 float_real = prog->float_substr;
2896 }
d6a28714 2897
c33e64f0
FC
2898 little = SvPV_const(float_real, len);
2899 if (SvTAIL(float_real)) {
7f18ad16
KW
2900 /* This means that float_real contains an artificial \n on
2901 * the end due to the presence of something like this:
2902 * /foo$/ where we can match both "foo" and "foo\n" at the
2903 * end of the string. So we have to compare the end of the
2904 * string first against the float_real without the \n and
2905 * then against the full float_real with the string. We
2906 * have to watch out for cases where the string might be
2907 * smaller than the float_real or the float_real without
2908 * the \n. */
1a13b075
YO
2909 char *checkpos= strend - len;
2910 DEBUG_OPTIMISE_r(
2911 PerlIO_printf(Perl_debug_log,
2912 "%sChecking for float_real.%s\n",
2913 PL_colors[4], PL_colors[5]));
2914 if (checkpos + 1 < strbeg) {
7f18ad16
KW
2915 /* can't match, even if we remove the trailing \n
2916 * string is too short to match */
1a13b075
YO
2917 DEBUG_EXECUTE_r(
2918 PerlIO_printf(Perl_debug_log,
2919 "%sString shorter than required trailing substring, cannot match.%s\n",
2920 PL_colors[4], PL_colors[5]));
2921 goto phooey;
2922 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
2923 /* can match, the end of the string matches without the
2924 * "\n" */
1a13b075
YO
2925 last = checkpos + 1;
2926 } else if (checkpos < strbeg) {
7f18ad16
KW
2927 /* cant match, string is too short when the "\n" is
2928 * included */
1a13b075
YO
2929 DEBUG_EXECUTE_r(
2930 PerlIO_printf(Perl_debug_log,
2931 "%sString does not contain required trailing substring, cannot match.%s\n",
2932 PL_colors[4], PL_colors[5]));
2933 goto phooey;
2934 } else if (!multiline) {
7f18ad16
KW
2935 /* non multiline match, so compare with the "\n" at the
2936 * end of the string */
1a13b075
YO
2937 if (memEQ(checkpos, little, len)) {
2938 last= checkpos;
2939 } else {
2940 DEBUG_EXECUTE_r(
2941 PerlIO_printf(Perl_debug_log,
2942 "%sString does not contain required trailing substring, cannot match.%s\n",
2943 PL_colors[4], PL_colors[5]));
2944 goto phooey;
2945 }
2946 } else {
7f18ad16
KW
2947 /* multiline match, so we have to search for a place
2948 * where the full string is located */
d6a28714 2949 goto find_last;
1a13b075 2950 }
c33e64f0 2951 } else {
d6a28714 2952 find_last:
9041c2e3 2953 if (len)
d6a28714 2954 last = rninstr(s, strend, little, little + len);
b8c5462f 2955 else
a0288114 2956 last = strend; /* matching "$" */
b8c5462f 2957 }
6af40bd7 2958 if (!last) {
7f18ad16
KW
2959 /* at one point this block contained a comment which was
2960 * probably incorrect, which said that this was a "should not
2961 * happen" case. Even if it was true when it was written I am
2962 * pretty sure it is not anymore, so I have removed the comment
2963 * and replaced it with this one. Yves */
6bda09f9
YO
2964 DEBUG_EXECUTE_r(
2965 PerlIO_printf(Perl_debug_log,
6af40bd7
YO
2966 "String does not contain required substring, cannot match.\n"
2967 ));
2968 goto phooey;
bf93d4cc 2969 }
d6a28714
JH
2970 dontbother = strend - last + prog->float_min_offset;
2971 }
2972 if (minlen && (dontbother < minlen))
2973 dontbother = minlen - 1;
2974 strend -= dontbother; /* this one's always in bytes! */
2975 /* We don't know much -- general case. */
f2ed9b32 2976 if (utf8_target) {
d6a28714 2977 for (;;) {
02d5137b 2978 if (regtry(reginfo, &s))
d6a28714
JH
2979 goto got_it;
2980 if (s >= strend)
2981 break;
b8c5462f 2982 s += UTF8SKIP(s);
d6a28714
JH
2983 };
2984 }
2985 else {
2986 do {
02d5137b 2987 if (regtry(reginfo, &s))
d6a28714
JH
2988 goto got_it;
2989 } while (s++ < strend);
2990 }
2991 }
2992
2993 /* Failure. */
2994 goto phooey;
2995
2996got_it:
d5e7783a
DM
2997 /* s/// doesn't like it if $& is earlier than where we asked it to
2998 * start searching (which can happen on something like /.\G/) */
2999 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3000 && (prog->offs[0].start < stringarg - strbeg))
3001 {
3002 /* this should only be possible under \G */
58430ea8 3003 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
3004 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3005 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3006 goto phooey;
3007 }
3008
495f47a5
DM
3009 DEBUG_BUFFERS_r(
3010 if (swap)
3011 PerlIO_printf(Perl_debug_log,
3012 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3013 PTR2UV(prog),
3014 PTR2UV(swap)
3015 );
3016 );
e9105d30 3017 Safefree(swap);
d6a28714 3018
bf2039a9
DM
3019 /* clean up; this will trigger destructors that will free all slabs
3020 * above the current one, and cleanup the regmatch_info_aux
3021 * and regmatch_info_aux_eval sructs */
8adc0f72 3022
006f26b2
DM
3023 LEAVE_SCOPE(oldsave);
3024
5daac39c
NC
3025 if (RXp_PAREN_NAMES(prog))
3026 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714 3027
0254aed9
DM
3028 RX_MATCH_UTF8_set(rx, utf8_target);
3029
d6a28714 3030 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3031 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3032 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3033 strbeg, reginfo->strend,
3034 sv, flags, utf8_target);
9041c2e3 3035
d6a28714
JH
3036 return 1;
3037
3038phooey:
a3621e74 3039 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 3040 PL_colors[4], PL_colors[5]));
8adc0f72 3041
bf2039a9
DM
3042 /* clean up; this will trigger destructors that will free all slabs
3043 * above the current one, and cleanup the regmatch_info_aux
3044 * and regmatch_info_aux_eval sructs */
8adc0f72 3045
006f26b2
DM
3046 LEAVE_SCOPE(oldsave);
3047
e9105d30 3048 if (swap) {
c74340f9 3049 /* we failed :-( roll it back */
495f47a5
DM
3050 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3051 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3052 PTR2UV(prog),
3053 PTR2UV(prog->offs),
3054 PTR2UV(swap)
3055 ));
e9105d30
GG
3056 Safefree(prog->offs);
3057 prog->offs = swap;
3058 }
d6a28714
JH
3059 return 0;
3060}
3061
6bda09f9 3062
b3d298be 3063/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3064 * Do inc before dec, in case old and new rex are the same */
baa60164 3065#define SET_reg_curpm(Re2) \
bf2039a9 3066 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3067 (void)ReREFCNT_inc(Re2); \
3068 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3069 PM_SETRE((PL_reg_curpm), (Re2)); \
3070 }
3071
3072
d6a28714
JH
3073/*
3074 - regtry - try match at specific point
3075 */
3076STATIC I32 /* 0 failure, 1 success */
f73aaa43 3077S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3078{
97aff369 3079 dVAR;
d6a28714 3080 CHECKPOINT lastcp;
288b8c02 3081 REGEXP *const rx = reginfo->prog;
8d919b0a 3082 regexp *const prog = ReANY(rx);
99a90e59 3083 SSize_t result;
f8fc2ecf 3084 RXi_GET_DECL(prog,progi);
a3621e74 3085 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3086
3087 PERL_ARGS_ASSERT_REGTRY;
3088
24b23f37 3089 reginfo->cutpoint=NULL;
d6a28714 3090
9d9163fb 3091 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3092 prog->lastparen = 0;
03994de8 3093 prog->lastcloseparen = 0;
d6a28714
JH
3094
3095 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3096 to do this again and again, prog->lastparen should take care of
3dd2943c 3097 this! --ilya*/
dafc8851
JH
3098
3099 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3100 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3101 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3102 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3103 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3104 * Meanwhile, this code *is* needed for the
daf18116
JH
3105 * above-mentioned test suite tests to succeed. The common theme
3106 * on those tests seems to be returning null fields from matches.
225593e1 3107 * --jhi updated by dapm */
dafc8851 3108#if 1
d6a28714 3109 if (prog->nparens) {
b93070ed 3110 regexp_paren_pair *pp = prog->offs;
eb578fdb 3111 I32 i;
b93070ed 3112 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3113 ++pp;
3114 pp->start = -1;
3115 pp->end = -1;
d6a28714
JH
3116 }
3117 }
dafc8851 3118#endif
02db2b7b 3119 REGCP_SET(lastcp);
f73aaa43
DM
3120 result = regmatch(reginfo, *startposp, progi->program + 1);
3121 if (result != -1) {
3122 prog->offs[0].end = result;
d6a28714
JH
3123 return 1;
3124 }
24b23f37 3125 if (reginfo->cutpoint)
f73aaa43 3126 *startposp= reginfo->cutpoint;
02db2b7b 3127 REGCP_UNWIND(lastcp);
d6a28714
JH
3128 return 0;
3129}
3130
02db2b7b 3131
8ba1375e
MJD
3132#define sayYES goto yes
3133#define sayNO goto no
262b90c4 3134#define sayNO_SILENT goto no_silent
8ba1375e 3135
f9f4320a
YO
3136/* we dont use STMT_START/END here because it leads to
3137 "unreachable code" warnings, which are bogus, but distracting. */
3138#define CACHEsayNO \
c476f425 3139 if (ST.cache_mask) \
2ac8ff4b 3140 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3141 sayNO
3298f257 3142
a3621e74 3143/* this is used to determine how far from the left messages like
265c4333
YO
3144 'failed...' are printed. It should be set such that messages
3145 are inline with the regop output that created them.
a3621e74 3146*/
265c4333 3147#define REPORT_CODE_OFF 32
a3621e74
YO
3148
3149
40a82448
DM
3150#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3151#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3152#define CHRTEST_NOT_A_CP_1 -999
3153#define CHRTEST_NOT_A_CP_2 -998
9e137952 3154
5d9a96ca
DM
3155/* grab a new slab and return the first slot in it */
3156
3157STATIC regmatch_state *
3158S_push_slab(pTHX)
3159{
a35a87e7 3160#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3161 dMY_CXT;
3162#endif
5d9a96ca
DM
3163 regmatch_slab *s = PL_regmatch_slab->next;
3164 if (!s) {
3165 Newx(s, 1, regmatch_slab);
3166 s->prev = PL_regmatch_slab;
3167 s->next = NULL;
3168 PL_regmatch_slab->next = s;
3169 }
3170 PL_regmatch_slab = s;
86545054 3171 return SLAB_FIRST(s);
5d9a96ca 3172}
5b47454d 3173
95b24440 3174
40a82448
DM
3175/* push a new state then goto it */
3176
4d5016e5
DM
3177#define PUSH_STATE_GOTO(state, node, input) \
3178 pushinput = input; \
40a82448
DM
3179 scan = node; \
3180 st->resume_state = state; \
3181 goto push_state;
3182
3183/* push a new state with success backtracking, then goto it */
3184
4d5016e5
DM
3185#define PUSH_YES_STATE_GOTO(state, node, input) \
3186 pushinput = input; \
40a82448
DM
3187 scan = node; \
3188 st->resume_state = state; \
3189 goto push_yes_state;
3190
aa283a38 3191
aa283a38 3192
4d5016e5 3193
d6a28714 3194/*
95b24440 3195
bf1f174e
DM
3196regmatch() - main matching routine
3197
3198This is basically one big switch statement in a loop. We execute an op,
3199set 'next' to point the next op, and continue. If we come to a point which
3200we may need to backtrack to on failure such as (A|B|C), we push a
3201backtrack state onto the backtrack stack. On failure, we pop the top
3202state, and re-enter the loop at the state indicated. If there are no more
3203states to pop, we return failure.
3204
3205Sometimes we also need to backtrack on success; for example /A+/, where
3206after successfully matching one A, we need to go back and try to
3207match another one; similarly for lookahead assertions: if the assertion
3208completes successfully, we backtrack to the state just before the assertion
3209and then carry on. In these cases, the pushed state is marked as
3210'backtrack on success too'. This marking is in fact done by a chain of
3211pointers, each pointing to the previous 'yes' state. On success, we pop to
3212the nearest yes state, discarding any intermediate failure-only states.
3213Sometimes a yes state is pushed just to force some cleanup code to be
3214called at the end of a successful match or submatch; e.g. (??{$re}) uses
3215it to free the inner regex.
3216
3217Note that failure backtracking rewinds the cursor position, while
3218success backtracking leaves it alone.
3219
3220A pattern is complete when the END op is executed, while a subpattern
3221such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3222ops trigger the "pop to last yes state if any, otherwise return true"
3223behaviour.
3224
3225A common convention in this function is to use A and B to refer to the two
3226subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3227the subpattern to be matched possibly multiple times, while B is the entire
3228rest of the pattern. Variable and state names reflect this convention.
3229
3230The states in the main switch are the union of ops and failure/success of
3231substates associated with with that op. For example, IFMATCH is the op
3232that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3233'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3234successfully matched A and IFMATCH_A_fail is a state saying that we have
3235just failed to match A. Resume states always come in pairs. The backtrack
3236state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3237at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3238on success or failure.
3239
3240The struct that holds a backtracking state is actually a big union, with
3241one variant for each major type of op. The variable st points to the
3242top-most backtrack struct. To make the code clearer, within each
3243block of code we #define ST to alias the relevant union.
3244
3245Here's a concrete example of a (vastly oversimplified) IFMATCH
3246implementation:
3247
3248 switch (state) {
3249 ....
3250
3251#define ST st->u.ifmatch
3252
3253 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3254 ST.foo = ...; // some state we wish to save
95b24440 3255 ...
bf1f174e
DM
3256 // push a yes backtrack state with a resume value of
3257 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3258 // first node of A:
4d5016e5 3259 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3260 // NOTREACHED
3261
3262 case IFMATCH_A: // we have successfully executed A; now continue with B
3263 next = B;
3264 bar = ST.foo; // do something with the preserved value
3265 break;
3266
3267 case IFMATCH_A_fail: // A failed, so the assertion failed
3268 ...; // do some housekeeping, then ...
3269 sayNO; // propagate the failure
3270
3271#undef ST
95b24440 3272
bf1f174e
DM
3273 ...
3274 }
95b24440 3275
bf1f174e
DM
3276For any old-timers reading this who are familiar with the old recursive
3277approach, the code above is equivalent to:
95b24440 3278
bf1f174e
DM
3279 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3280 {
3281 int foo = ...
95b24440 3282 ...
bf1f174e
DM
3283 if (regmatch(A)) {
3284 next = B;
3285 bar = foo;
3286 break;
95b24440 3287 }
bf1f174e
DM
3288 ...; // do some housekeeping, then ...
3289 sayNO; // propagate the failure
95b24440 3290 }
bf1f174e
DM
3291
3292The topmost backtrack state, pointed to by st, is usually free. If you
3293want to claim it, populate any ST.foo fields in it with values you wish to
3294save, then do one of
3295
4d5016e5
DM
3296 PUSH_STATE_GOTO(resume_state, node, newinput);
3297 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3298
3299which sets that backtrack state's resume value to 'resume_state', pushes a
3300new free entry to the top of the backtrack stack, then goes to 'node'.
3301On backtracking, the free slot is popped, and the saved state becomes the
3302new free state. An ST.foo field in this new top state can be temporarily
3303accessed to retrieve values, but once the main loop is re-entered, it
3304becomes available for reuse.
3305
3306Note that the depth of the backtrack stack constantly increases during the
3307left-to-right execution of the pattern, rather than going up and down with
3308the pattern nesting. For example the stack is at its maximum at Z at the
3309end of the pattern, rather than at X in the following:
3310
3311 /(((X)+)+)+....(Y)+....Z/
3312
3313The only exceptions to this are lookahead/behind assertions and the cut,
3314(?>A), which pop all the backtrack states associated with A before
3315continuing.
3316
486ec47a 3317Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3318PL_regmatch_state and st always point to the currently active state,
3319and PL_regmatch_slab points to the slab currently containing
3320PL_regmatch_state. The first time regmatch() is called, the first slab is
3321allocated, and is never freed until interpreter destruction. When the slab
3322is full, a new one is allocated and chained to the end. At exit from
3323regmatch(), slabs allocated since entry are freed.
3324
3325*/
95b24440 3326
40a82448 3327
5bc10b2c 3328#define DEBUG_STATE_pp(pp) \
265c4333 3329 DEBUG_STATE_r({ \
baa60164 3330 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3331 PerlIO_printf(Perl_debug_log, \
5d458dd8 3332 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3333 depth*2, "", \
baa60164 3334 PL_reg_name[st->resume_state], \
5d458dd8
YO
3335 ((st==yes_state||st==mark_state) ? "[" : ""), \
3336 ((st==yes_state) ? "Y" : ""), \
3337 ((st==mark_state) ? "M" : ""), \
3338 ((st==yes_state||st==mark_state) ? "]" : "") \
3339 ); \
265c4333 3340 });
5bc10b2c 3341
40a82448 3342
3dab1dad 3343#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3344
3df15adc 3345#ifdef DEBUGGING
5bc10b2c 3346
ab3bbdeb 3347STATIC void
f2ed9b32 3348S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3349 const char *start, const char *end, const char *blurb)
3350{
efd26800 3351 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3352
3353 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3354
ab3bbdeb
YO
3355 if (!PL_colorset)
3356 reginitcolors();
3357 {
3358 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3359 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3360
f2ed9b32 3361 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3362 start, end - start, 60);
3363
3364 PerlIO_printf(Perl_debug_log,
3365 "%s%s REx%s %s against %s\n",
3366 PL_colors[4], blurb, PL_colors[5], s0, s1);
3367
f2ed9b32 3368 if (utf8_target||utf8_pat)
1de06328
YO
3369 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3370 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3371 utf8_pat && utf8_target ? " and " : "",
3372 utf8_target ? "string" : ""
ab3bbdeb
YO
3373 );
3374 }
3375}
3df15adc
YO
3376
3377STATIC void
786e8c11
YO
3378S_dump_exec_pos(pTHX_ const char *locinput,
3379 const regnode *scan,
3380 const char *loc_regeol,
3381 const char *loc_bostr,
3382 const char *loc_reg_starttry,
f2ed9b32 3383 const bool utf8_target)
07be1b83 3384{
786e8c11 3385 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3386 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3387 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3388 /* The part of the string before starttry has one color
3389 (pref0_len chars), between starttry and current
3390 position another one (pref_len - pref0_len chars),
3391 after the current position the third one.
3392 We assume that pref0_len <= pref_len, otherwise we
3393 decrease pref0_len. */
786e8c11
YO
3394 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3395 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3396 int pref0_len;
3397
7918f24d
NC
3398 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3399
f2ed9b32 3400 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3401 pref_len++;
786e8c11
YO
3402 pref0_len = pref_len - (locinput - loc_reg_starttry);
3403 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3404 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3405 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3406 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3407 l--;
3408 if (pref0_len < 0)
3409 pref0_len = 0;
3410 if (pref0_len > pref_len)
3411 pref0_len = pref_len;
3412 {
f2ed9b32 3413 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
0df25f3d 3414
ab3bbdeb 3415 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3416 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3417
ab3bbdeb 3418 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3419 (locinput - pref_len + pref0_len),
1de06328 3420 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3421
ab3bbdeb 3422 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3423 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3424
1de06328 3425 const STRLEN tlen=len0+len1+len2;
3df15adc 3426 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3427 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3428 (IV)(locinput - loc_bostr),
07be1b83 3429 len0, s0,
07be1b83 3430 len1, s1,
07be1b83 3431 (docolor ? "" : "> <"),
07be1b83 3432 len2, s2,
f9f4320a 3433 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3434 "");
3435 }
3436}
3df15adc 3437
07be1b83
YO
3438#endif
3439
0a4db386
YO
3440/* reg_check_named_buff_matched()
3441 * Checks to see if a named buffer has matched. The data array of
3442 * buffer numbers corresponding to the buffer is expected to reside
3443 * in the regexp->data->data array in the slot stored in the ARG() of
3444 * node involved. Note that this routine doesn't actually care about the
3445 * name, that information is not preserved from compilation to execution.
3446 * Returns the index of the leftmost defined buffer with the given name
3447 * or 0 if non of the buffers matched.
3448 */
3449STATIC I32
7918f24d
NC
3450S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3451{
0a4db386 3452 I32 n;
f8fc2ecf 3453 RXi_GET_DECL(rex,rexi);
ad64d0ec 3454 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3455 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3456
3457 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3458
0a4db386 3459 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3460 if ((I32)rex->lastparen >= nums[n] &&
3461 rex->offs[nums[n]].end != -1)
0a4db386
YO
3462 {
3463 return nums[n];
3464 }
3465 }
3466 return 0;
3467}
3468
2f554ef7 3469
c74f6de9 3470static bool
984e6dd1 3471S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 3472 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 3473{
79a2a0e8
KW
3474 /* This function determines if there are one or two characters that match
3475 * the first character of the passed-in EXACTish node <text_node>, and if
3476 * so, returns them in the passed-in pointers.
c74f6de9 3477 *
79a2a0e8
KW
3478 * If it determines that no possible character in the target string can
3479 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3480 * the first character in <text_node> requires UTF-8 to represent, and the
3481 * target string isn't in UTF-8.)
c74f6de9 3482 *
79a2a0e8
KW
3483 * If there are more than two characters that could match the beginning of
3484 * <text_node>, or if more context is required to determine a match or not,
3485 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3486 *
3487 * The motiviation behind this function is to allow the caller to set up
3488 * tight loops for matching. If <text_node> is of type EXACT, there is
3489 * only one possible character that can match its first character, and so
3490 * the situation is quite simple. But things get much more complicated if
3491 * folding is involved. It may be that the first character of an EXACTFish
3492 * node doesn't participate in any possible fold, e.g., punctuation, so it
3493 * can be matched only by itself. The vast majority of characters that are
3494 * in folds match just two things, their lower and upper-case equivalents.
3495 * But not all are like that; some have multiple possible matches, or match
3496 * sequences of more than one character. This function sorts all that out.
3497 *
3498 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3499 * loop of trying to match A*, we know we can't exit where the thing
3500 * following it isn't a B. And something can't be a B unless it is the
3501 * beginning of B. By putting a quick test for that beginning in a tight
3502 * loop, we can rule out things that can't possibly be B without having to
3503 * break out of the loop, thus avoiding work. Similarly, if A is a single
3504 * character, we can make a tight loop matching A*, using the outputs of
3505 * this function.
3506 *
3507 * If the target string to match isn't in UTF-8, and there aren't
3508 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3509 * the one or two possible octets (which are characters in this situation)
3510 * that can match. In all cases, if there is only one character that can
3511 * match, *<c1p> and *<c2p> will be identical.
3512 *
3513 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3514 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3515 * can match the beginning of <text_node>. They should be declared with at