This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Various .t files: Use globals to see if on EBCDIC
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
bbac6b20
KW
86#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
88
a0bd1a30
KW
89static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
91
e1cf74e3
CB
92#ifdef DEBUGGING
93/* At least one required character in the target string is expressible only in
94 * UTF-8. */
95static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
97#endif
98
99#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
101 goto target; \
102} STMT_END
103
c74f6de9
KW
104#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
2d66f61e 110/* Valid only for non-utf8 strings: avoids the reginclass
7e2509c1
KW
111 * call if there are no complications: i.e., if everything matchable is
112 * straight forward in the bitmap */
3db24e1e 113#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
af364d03 114 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 115
c277df42
IZ
116/*
117 * Forwards.
118 */
119
f2ed9b32 120#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
ba44c216 121#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
a0ed51b3 122
3dab1dad 123#define HOPc(pos,off) \
ba44c216 124 (char *)(reginfo->is_utf8_target \
220db18a 125 ? reghop3((U8*)pos, off, \
9d9163fb 126 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 127 : (U8*)(pos + off))
557f47af 128
3dab1dad 129#define HOPBACKc(pos, off) \
ba44c216 130 (char*)(reginfo->is_utf8_target \
9d9163fb
DM
131 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
132 : (pos - off >= reginfo->strbeg) \
8e11feef 133 ? (U8*)pos - off \
3dab1dad 134 : NULL)
efb30f32 135
ba44c216 136#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 137#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 138
557f47af
DM
139/* lim must be +ve. Returns NULL on overshoot */
140#define HOPMAYBE3(pos,off,lim) \
141 (reginfo->is_utf8_target \
142 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
143 : ((U8*)pos + off <= lim) \
144 ? (U8*)pos + off \
145 : NULL)
146
8e9f2289
DM
147/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
148 * off must be >=0; args should be vars rather than expressions */
149#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
150 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
151 : (U8*)((pos + off) > lim ? lim : (pos + off)))
152
2974eaec
DM
153#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
154 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
155 : (U8*)(pos + off))
156#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
157
158#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
159#define NEXTCHR_IS_EOS (nextchr < 0)
160
161#define SET_nextchr \
220db18a 162 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
163
164#define SET_locinput(p) \
165 locinput = (p); \
166 SET_nextchr
167
168
2a16ac92 169#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
170 if (!swash_ptr) { \
171 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 172 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 173 1, 0, invlist, &flags); \
c7304fe2
KW
174 assert(swash_ptr); \
175 } \
176 } STMT_END
177
178/* If in debug mode, we test that a known character properly matches */
179#ifdef DEBUGGING
180# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
181 property_name, \
2a16ac92 182 invlist, \
c7304fe2 183 utf8_char_in_property) \
2a16ac92 184 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
185 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
186#else
187# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
188 property_name, \
2a16ac92 189 invlist, \
c7304fe2 190 utf8_char_in_property) \
2a16ac92 191 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 192#endif
d1eb3177 193
c7304fe2
KW
194#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
195 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
196 "", \
197 PL_XPosix_ptrs[_CC_WORDCHAR], \
0766489e 198 LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
c7304fe2 199
c7304fe2 200#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
201/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
202
5f80c4cf 203/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
204/* it would be nice to rework regcomp.sym to generate this stuff. sigh
205 *
206 * NOTE that *nothing* that affects backtracking should be in here, specifically
207 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
208 * node that is in between two EXACT like nodes when ascertaining what the required
209 * "follow" character is. This should probably be moved to regex compile time
210 * although it may be done at run time beause of the REF possibility - more
211 * investigation required. -- demerphq
212*/
baa60164
KW
213#define JUMPABLE(rn) ( \
214 OP(rn) == OPEN || \
3e901dc0 215 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
baa60164
KW
216 OP(rn) == EVAL || \
217 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
218 OP(rn) == PLUS || OP(rn) == MINMOD || \
219 OP(rn) == KEEPS || \
220 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 221)
ee9b8eae 222#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 223
ee9b8eae
YO
224#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
225
226#if 0
227/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
a4525e78 228 we don't need this definition. XXX These are now out-of-sync*/
ee9b8eae 229#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 230#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
231#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
232
233#else
234/* ... so we use this as its faster. */
a4525e78
KW
235#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
236#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
237#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
238#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
239
240#endif
e2d8ce26 241
a84d97b6
HS
242/*
243 Search for mandatory following text node; for lookahead, the text must
244 follow but for lookbehind (rn->flags != 0) we skip to the next step.
245*/
baa60164 246#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
247 while (JUMPABLE(rn)) { \
248 const OPCODE type = OP(rn); \
249 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 250 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 251 else if (type == PLUS) \
cca55fe3 252 rn = NEXTOPER(rn); \
3dab1dad 253 else if (type == IFMATCH) \
a84d97b6 254 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 255 else rn += NEXT_OFF(rn); \
3dab1dad 256 } \
5f80c4cf 257} STMT_END
74750237 258
006f26b2
DM
259#define SLAB_FIRST(s) (&(s)->states[0])
260#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
261
a75351a1 262static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 263static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 264static regmatch_state * S_push_slab(pTHX);
51371543 265
87c0511b 266#define REGCP_PAREN_ELEMS 3
f067efbf 267#define REGCP_OTHER_ELEMS 3
e0fa7e2b 268#define REGCP_FRAME_ELEMS 1
620d5b66
NC
269/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
270 * are needed for the regexp context stack bookkeeping. */
271
76e3520e 272STATIC CHECKPOINT
92da3157 273S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
a0d0e21e 274{
a3b680e6 275 const int retval = PL_savestack_ix;
92da3157
DM
276 const int paren_elems_to_push =
277 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
278 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
279 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 280 I32 p;
40a82448 281 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 282
b93070ed
DM
283 PERL_ARGS_ASSERT_REGCPPUSH;
284
e49a9654 285 if (paren_elems_to_push < 0)
e8a85d26
JH
286 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
287 (int)paren_elems_to_push, (int)maxopenparen,
288 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 289
e0fa7e2b
NC
290 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
291 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
5df417d0 292 " out of range (%lu-%ld)",
92da3157
DM
293 total_elems,
294 (unsigned long)maxopenparen,
295 (long)parenfloor);
e0fa7e2b 296
620d5b66 297 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 298
495f47a5 299 DEBUG_BUFFERS_r(
92da3157 300 if ((int)maxopenparen > (int)parenfloor)
495f47a5
DM
301 PerlIO_printf(Perl_debug_log,
302 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
303 PTR2UV(rex),
304 PTR2UV(rex->offs)
305 );
306 );
92da3157 307 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 308/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
309 SSPUSHIV(rex->offs[p].end);
310 SSPUSHIV(rex->offs[p].start);
1ca2007e 311 SSPUSHINT(rex->offs[p].start_tmp);
e7707071 312 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
495f47a5
DM
313 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
314 (UV)p,
315 (IV)rex->offs[p].start,
316 (IV)rex->offs[p].start_tmp,
317 (IV)rex->offs[p].end
40a82448 318 ));
a0d0e21e 319 }
b1ce53c5 320/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 321 SSPUSHINT(maxopenparen);
b93070ed
DM
322 SSPUSHINT(rex->lastparen);
323 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 324 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 325
a0d0e21e
LW
326 return retval;
327}
328
c277df42 329/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
330#define REGCP_SET(cp) \
331 DEBUG_STATE_r( \
ab3bbdeb 332 PerlIO_printf(Perl_debug_log, \
e4f74956 333 " Setting an EVAL scope, savestack=%"IVdf"\n", \
ab3bbdeb
YO
334 (IV)PL_savestack_ix)); \
335 cp = PL_savestack_ix
c3464db5 336
ab3bbdeb 337#define REGCP_UNWIND(cp) \
e4f74956 338 DEBUG_STATE_r( \
ab3bbdeb 339 if (cp != PL_savestack_ix) \
e4f74956
YO
340 PerlIO_printf(Perl_debug_log, \
341 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
ab3bbdeb
YO
342 (IV)(cp), (IV)PL_savestack_ix)); \
343 regcpblow(cp)
c277df42 344
a8d1f4b4
DM
345#define UNWIND_PAREN(lp, lcp) \
346 for (n = rex->lastparen; n > lp; n--) \
347 rex->offs[n].end = -1; \
348 rex->lastparen = n; \
349 rex->lastcloseparen = lcp;
350
351
f067efbf 352STATIC void
92da3157 353S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
a0d0e21e 354{
e0fa7e2b 355 UV i;
87c0511b 356 U32 paren;
a3621e74
YO
357 GET_RE_DEBUG_FLAGS_DECL;
358
7918f24d
NC
359 PERL_ARGS_ASSERT_REGCPPOP;
360
b1ce53c5 361 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 362 i = SSPOPUV;
e0fa7e2b
NC
363 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
364 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
365 rex->lastcloseparen = SSPOPINT;
366 rex->lastparen = SSPOPINT;
92da3157 367 *maxopenparen_p = SSPOPINT;
b1ce53c5 368
620d5b66 369 i -= REGCP_OTHER_ELEMS;
b1ce53c5 370 /* Now restore the parentheses context. */
495f47a5
DM
371 DEBUG_BUFFERS_r(
372 if (i || rex->lastparen + 1 <= rex->nparens)
373 PerlIO_printf(Perl_debug_log,
374 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
375 PTR2UV(rex),
376 PTR2UV(rex->offs)
377 );
378 );
92da3157 379 paren = *maxopenparen_p;
620d5b66 380 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 381 SSize_t tmps;
1ca2007e 382 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
383 rex->offs[paren].start = SSPOPIV;
384 tmps = SSPOPIV;
b93070ed
DM
385 if (paren <= rex->lastparen)
386 rex->offs[paren].end = tmps;
495f47a5
DM
387 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
388 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
389 (UV)paren,
390 (IV)rex->offs[paren].start,
391 (IV)rex->offs[paren].start_tmp,
392 (IV)rex->offs[paren].end,
393 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 394 );
87c0511b 395 paren--;
a0d0e21e 396 }
daf18116 397#if 1
dafc8851
JH
398 /* It would seem that the similar code in regtry()
399 * already takes care of this, and in fact it is in
400 * a better location to since this code can #if 0-ed out
401 * but the code in regtry() is needed or otherwise tests
402 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
403 * (as of patchlevel 7877) will fail. Then again,
404 * this code seems to be necessary or otherwise
225593e1
DM
405 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
406 * --jhi updated by dapm */
b93070ed 407 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 408 if (i > *maxopenparen_p)
b93070ed
DM
409 rex->offs[i].start = -1;
410 rex->offs[i].end = -1;
495f47a5
DM
411 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
412 " \\%"UVuf": %s ..-1 undeffing\n",
413 (UV)i,
92da3157 414 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 415 ));
a0d0e21e 416 }
dafc8851 417#endif
a0d0e21e
LW
418}
419
74088413
DM
420/* restore the parens and associated vars at savestack position ix,
421 * but without popping the stack */
422
423STATIC void
92da3157 424S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
74088413
DM
425{
426 I32 tmpix = PL_savestack_ix;
427 PL_savestack_ix = ix;
92da3157 428 regcppop(rex, maxopenparen_p);
74088413
DM
429 PL_savestack_ix = tmpix;
430}
431
02db2b7b 432#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 433
31c7f561
KW
434STATIC bool
435S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
436{
437 /* Returns a boolean as to whether or not 'character' is a member of the
438 * Posix character class given by 'classnum' that should be equivalent to a
439 * value in the typedef '_char_class_number'.
440 *
441 * Ideally this could be replaced by a just an array of function pointers
442 * to the C library functions that implement the macros this calls.
443 * However, to compile, the precise function signatures are required, and
444 * these may vary from platform to to platform. To avoid having to figure
445 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
446 * which adds an extra layer of function call overhead (unless the C
447 * optimizer strips it away). But we don't particularly care about
448 * performance with locales anyway. */
31c7f561
KW
449
450 switch ((_char_class_number) classnum) {
15861f94 451 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 452 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
453 case _CC_ENUM_ASCII: return isASCII_LC(character);
454 case _CC_ENUM_BLANK: return isBLANK_LC(character);
b0d691b2
KW
455 case _CC_ENUM_CASED: return isLOWER_LC(character)
456 || isUPPER_LC(character);
e8d596e0 457 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
458 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
459 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
460 case _CC_ENUM_LOWER: return isLOWER_LC(character);
461 case _CC_ENUM_PRINT: return isPRINT_LC(character);
462 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 463 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
464 case _CC_ENUM_UPPER: return isUPPER_LC(character);
465 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 466 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
467 default: /* VERTSPACE should never occur in locales */
468 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
469 }
470
e5964223 471 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
472 return FALSE;
473}
474
3018b823
KW
475STATIC bool
476S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
477{
478 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
479 * 'character' is a member of the Posix character class given by 'classnum'
480 * that should be equivalent to a value in the typedef
481 * '_char_class_number'.
482 *
483 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 484 * the range 0-255. Outside that range, all characters use Unicode
3018b823
KW
485 * rules, ignoring any locale. So use the Unicode function if this class
486 * requires a swash, and use the Unicode macro otherwise. */
487
488 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
489
490 if (UTF8_IS_INVARIANT(*character)) {
491 return isFOO_lc(classnum, *character);
492 }
493 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
494 return isFOO_lc(classnum,
94bb8c36 495 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
496 }
497
613abc6d
KW
498 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
499
3018b823
KW
500 if (classnum < _FIRST_NON_SWASH_CC) {
501
502 /* Initialize the swash unless done already */
503 if (! PL_utf8_swash_ptrs[classnum]) {
504 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
505 PL_utf8_swash_ptrs[classnum] =
506 _core_swash_init("utf8",
507 "",
508 &PL_sv_undef, 1, 0,
509 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
510 }
511
92a2046b
KW
512 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
513 character,
514 TRUE /* is UTF */ ));
3018b823
KW
515 }
516
517 switch ((_char_class_number) classnum) {
779cf272 518 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
519 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
520 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
521 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
e1ee3960 522 default: break;
3018b823
KW
523 }
524
e1ee3960 525 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
526}
527
a687059c 528/*
e50aee73 529 * pregexec and friends
a687059c
LW
530 */
531
76234dfb 532#ifndef PERL_IN_XSUB_RE
a687059c 533/*
c277df42 534 - pregexec - match a regexp against a string
a687059c 535 */
c277df42 536I32
5aaab254 537Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 538 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
539/* stringarg: the point in the string at which to begin matching */
540/* strend: pointer to null at end of string */
541/* strbeg: real beginning of string */
542/* minend: end of match must be >= minend bytes after stringarg. */
543/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
544 * itself is accessed via the pointers above */
545/* nosave: For optimizations. */
c277df42 546{
7918f24d
NC
547 PERL_ARGS_ASSERT_PREGEXEC;
548
c277df42 549 return
9041c2e3 550 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
551 nosave ? 0 : REXEC_COPY_STR);
552}
76234dfb 553#endif
22e551b9 554
cad2e5aa 555
6eb5f6b9 556
1a4edc3c
DM
557/* re_intuit_start():
558 *
559 * Based on some optimiser hints, try to find the earliest position in the
560 * string where the regex could match.
561 *
562 * rx: the regex to match against
563 * sv: the SV being matched: only used for utf8 flag; the string
564 * itself is accessed via the pointers below. Note that on
565 * something like an overloaded SV, SvPOK(sv) may be false
566 * and the string pointers may point to something unrelated to
567 * the SV itself.
568 * strbeg: real beginning of string
569 * strpos: the point in the string at which to begin matching
570 * strend: pointer to the byte following the last char of the string
571 * flags currently unused; set to 0
572 * data: currently unused; set to NULL
573 *
574 * The basic idea of re_intuit_start() is to use some known information
575 * about the pattern, namely:
576 *
577 * a) the longest known anchored substring (i.e. one that's at a
578 * constant offset from the beginning of the pattern; but not
579 * necessarily at a fixed offset from the beginning of the
580 * string);
581 * b) the longest floating substring (i.e. one that's not at a constant
582 * offset from the beginning of the pattern);
583 * c) Whether the pattern is anchored to the string; either
584 * an absolute anchor: /^../, or anchored to \n: /^.../m,
585 * or anchored to pos(): /\G/;
586 * d) A start class: a real or synthetic character class which
587 * represents which characters are legal at the start of the pattern;
588 *
589 * to either quickly reject the match, or to find the earliest position
590 * within the string at which the pattern might match, thus avoiding
591 * running the full NFA engine at those earlier locations, only to
592 * eventually fail and retry further along.
593 *
594 * Returns NULL if the pattern can't match, or returns the address within
595 * the string which is the earliest place the match could occur.
596 *
597 * The longest of the anchored and floating substrings is called 'check'
598 * and is checked first. The other is called 'other' and is checked
599 * second. The 'other' substring may not be present. For example,
600 *
601 * /(abc|xyz)ABC\d{0,3}DEFG/
602 *
603 * will have
604 *
605 * check substr (float) = "DEFG", offset 6..9 chars
606 * other substr (anchored) = "ABC", offset 3..3 chars
607 * stclass = [ax]
608 *
609 * Be aware that during the course of this function, sometimes 'anchored'
610 * refers to a substring being anchored relative to the start of the
611 * pattern, and sometimes to the pattern itself being anchored relative to
612 * the string. For example:
613 *
614 * /\dabc/: "abc" is anchored to the pattern;
615 * /^\dabc/: "abc" is anchored to the pattern and the string;
616 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
617 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
618 * but the pattern is anchored to the string.
52a21eb3
DM
619 */
620
cad2e5aa 621char *
52a21eb3
DM
622Perl_re_intuit_start(pTHX_
623 REGEXP * const rx,
624 SV *sv,
625 const char * const strbeg,
626 char *strpos,
627 char *strend,
628 const U32 flags,
629 re_scream_pos_data *data)
cad2e5aa 630{
8d919b0a 631 struct regexp *const prog = ReANY(rx);
6b071d16 632 SSize_t start_shift = prog->check_offset_min;
cad2e5aa 633 /* Should be nonnegative! */
ea3daa5d 634 SSize_t end_shift = 0;
0fc004dd
DM
635 /* current lowest pos in string where the regex can start matching */
636 char *rx_origin = strpos;
eb578fdb 637 SV *check;
f2ed9b32 638 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 639 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 640 bool ml_anch = 0;
8f4bf5fc 641 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 642 char *check_at = NULL; /* check substr found at this pos */
bbe252da 643 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 644 RXi_GET_DECL(prog,progi);
02d5137b
DM
645 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
646 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
647 GET_RE_DEBUG_FLAGS_DECL;
648
7918f24d 649 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
650 PERL_UNUSED_ARG(flags);
651 PERL_UNUSED_ARG(data);
7918f24d 652
1dc475d0
DM
653 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
654 "Intuit: trying to determine minimum start position...\n"));
655
fb9bbddb
DM
656 /* for now, assume that all substr offsets are positive. If at some point
657 * in the future someone wants to do clever things with look-behind and
658 * -ve offsets, they'll need to fix up any code in this function
659 * which uses these offsets. See the thread beginning
660 * <20140113145929.GF27210@iabyn.com>
661 */
662 assert(prog->substrs->data[0].min_offset >= 0);
663 assert(prog->substrs->data[0].max_offset >= 0);
664 assert(prog->substrs->data[1].min_offset >= 0);
665 assert(prog->substrs->data[1].max_offset >= 0);
666 assert(prog->substrs->data[2].min_offset >= 0);
667 assert(prog->substrs->data[2].max_offset >= 0);
668
f7022b5a 669 /* for now, assume that if both present, that the floating substring
83f2232d 670 * doesn't start before the anchored substring.
f7022b5a
DM
671 * If you break this assumption (e.g. doing better optimisations
672 * with lookahead/behind), then you'll need to audit the code in this
673 * function carefully first
674 */
675 assert(
676 ! ( (prog->anchored_utf8 || prog->anchored_substr)
677 && (prog->float_utf8 || prog->float_substr))
678 || (prog->float_min_offset >= prog->anchored_offset));
679
1a4edc3c
DM
680 /* byte rather than char calculation for efficiency. It fails
681 * to quickly reject some cases that can't match, but will reject
682 * them later after doing full char arithmetic */
c344f387 683 if (prog->minlen > strend - strpos) {
a3621e74 684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 685 " String too short...\n"));
cad2e5aa 686 goto fail;
2c2d71f5 687 }
d8da0584 688
ab4e48c1 689 RX_MATCH_UTF8_set(rx,utf8_target);
6c3fea77 690 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 691 reginfo->info_aux = NULL;
9d9163fb 692 reginfo->strbeg = strbeg;
220db18a 693 reginfo->strend = strend;
aed7b151 694 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 695 reginfo->intuit = 1;
1cb48e53
DM
696 /* not actually used within intuit, but zero for safety anyway */
697 reginfo->poscache_maxiter = 0;
02d5137b 698
f2ed9b32 699 if (utf8_target) {
33b8afdf
JH
700 if (!prog->check_utf8 && prog->check_substr)
701 to_utf8_substr(prog);
702 check = prog->check_utf8;
703 } else {
7e0d5ad7
KW
704 if (!prog->check_substr && prog->check_utf8) {
705 if (! to_byte_substr(prog)) {
6b54ddc5 706 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
707 }
708 }
33b8afdf
JH
709 check = prog->check_substr;
710 }
274cd312 711
1dc475d0
DM
712 /* dump the various substring data */
713 DEBUG_OPTIMISE_MORE_r({
714 int i;
715 for (i=0; i<=2; i++) {
716 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
717 : prog->substrs->data[i].substr);
718 if (!sv)
719 continue;
720
721 PerlIO_printf(Perl_debug_log,
722 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
723 " useful=%"IVdf" utf8=%d [%s]\n",
724 i,
725 (IV)prog->substrs->data[i].min_offset,
726 (IV)prog->substrs->data[i].max_offset,
727 (IV)prog->substrs->data[i].end_shift,
728 BmUSEFUL(sv),
729 utf8_target ? 1 : 0,
730 SvPEEK(sv));
731 }
732 });
733
8e1490ee 734 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
735
736 /* ml_anch: check after \n?
737 *
0fa70a06 738 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
739 * with /.*.../, these flags will have been added by the
740 * compiler:
741 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
742 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
743 */
7d2d37f5
DM
744 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
745 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 746
343c8a29 747 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
748 /* we are only allowed to match at BOS or \G */
749
57fcbfa7 750 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
751 *
752 * Note that we don't try to do a similar quick reject for
753 * \G, since generally the caller will have calculated strpos
754 * based on pos() and gofs, so the string is already correctly
755 * anchored by definition; and handling the exceptions would
756 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 757 */
7bb3b9eb 758 if ( strpos != strbeg
d3d47aac 759 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8
DM
760 {
761 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 762 " Not at start...\n"));
c889ccc8
DM
763 goto fail;
764 }
765
a5d12a4b
DM
766 /* in the presence of an anchor, the anchored (relative to the
767 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
768 * to strpos. So quickly reject if substr isn't found there.
769 * This works for \G too, because the caller will already have
770 * subtracted gofs from pos, and gofs is the offset from the
771 * \G to the start of the regex. For example, in /.abc\Gdef/,
772 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
773 * caller will have set strpos=pos()-4; we look for the substr
774 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 775
33c28ab2 776 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 777 /* Substring at constant offset from beg-of-str... */
d307bf57 778 SSize_t slen = SvCUR(check);
343c8a29 779 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 780
1dc475d0
DM
781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
782 " Looking for check substr at fixed offset %"IVdf"...\n",
783 (IV)prog->check_offset_min));
784
7742aa66
DM
785 if (SvTAIL(check)) {
786 /* In this case, the regex is anchored at the end too.
787 * Unless it's a multiline match, the lengths must match
788 * exactly, give or take a \n. NB: slen >= 1 since
789 * the last char of check is \n */
790 if (!multiline
791 && ( strend - s > slen
792 || strend - s < slen - 1
793 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8
DM
794 {
795 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 796 " String too long...\n"));
c889ccc8
DM
797 goto fail_finish;
798 }
799 /* Now should match s[0..slen-2] */
800 slen--;
c889ccc8 801 }
d307bf57
DM
802 if (slen && (*SvPVX_const(check) != *s
803 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
804 {
805 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 806 " String not equal...\n"));
d307bf57
DM
807 goto fail_finish;
808 }
c889ccc8
DM
809
810 check_at = s;
811 goto success_at_start;
cad2e5aa 812 }
cad2e5aa 813 }
cad2e5aa 814 }
0fc004dd 815
c0e0ec46 816 end_shift = prog->check_end_shift;
cad2e5aa 817
19188028 818#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 819 if (end_shift < 0)
1de06328 820 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
220fc49f 821 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
822#endif
823
2c2d71f5 824 restart:
1de06328 825
66b7ec5c
DM
826 /* This is the (re)entry point of the main loop in this function.
827 * The goal of this loop is to:
828 * 1) find the "check" substring in the region rx_origin..strend
829 * (adjusted by start_shift / end_shift). If not found, reject
830 * immediately.
831 * 2) If it exists, look for the "other" substr too if defined; for
832 * example, if the check substr maps to the anchored substr, then
833 * check the floating substr, and vice-versa. If not found, go
834 * back to (1) with rx_origin suitably incremented.
835 * 3) If we find an rx_origin position that doesn't contradict
836 * either of the substrings, then check the possible additional
837 * constraints on rx_origin of /^.../m or a known start class.
838 * If these fail, then depending on which constraints fail, jump
839 * back to here, or to various other re-entry points further along
840 * that skip some of the first steps.
841 * 4) If we pass all those tests, update the BmUSEFUL() count on the
842 * substring. If the start position was determined to be at the
843 * beginning of the string - so, not rejected, but not optimised,
844 * since we have to run regmatch from position 0 - decrement the
845 * BmUSEFUL() count. Otherwise increment it.
846 */
847
1a4edc3c
DM
848
849 /* first, look for the 'check' substring */
850
1de06328 851 {
c33e64f0
FC
852 U8* start_point;
853 U8* end_point;
c889ccc8 854
c889ccc8 855 DEBUG_OPTIMISE_MORE_r({
1dc475d0 856 PerlIO_printf(Perl_debug_log,
ae5d4331 857 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
1dc475d0 858 " Start shift: %"IVdf" End shift %"IVdf
4d281893 859 " Real end Shift: %"IVdf"\n",
675e93ee 860 (IV)(rx_origin - strbeg),
c889ccc8 861 (IV)prog->check_offset_min,
12fbc530
DM
862 (IV)start_shift,
863 (IV)end_shift,
c889ccc8
DM
864 (IV)prog->check_end_shift);
865 });
1de06328 866
33c28ab2
DM
867 end_point = HOP3(strend, -end_shift, strbeg);
868 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
869 if (!start_point)
870 goto fail_finish;
c889ccc8 871
557f47af 872
e0362b86 873 /* If the regex is absolutely anchored to either the start of the
d3d47aac 874 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
875 * check_offset_max represents an upper bound on the string where
876 * the substr could start. For the ANCH_GPOS case, we assume that
877 * the caller of intuit will have already set strpos to
878 * pos()-gofs, so in this case strpos + offset_max will still be
879 * an upper bound on the substr.
880 */
c19c836a
DM
881 if (!ml_anch
882 && prog->intflags & PREGf_ANCH
e0362b86 883 && prog->check_offset_max != SSize_t_MAX)
c19c836a 884 {
1a08ba3a 885 SSize_t len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
886 const char * const anchor =
887 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
888
889 /* do a bytes rather than chars comparison. It's conservative;
890 * so it skips doing the HOP if the result can't possibly end
891 * up earlier than the old value of end_point.
892 */
893 if ((char*)end_point - anchor > prog->check_offset_max) {
894 end_point = HOP3lim((U8*)anchor,
895 prog->check_offset_max,
896 end_point -len)
897 + len;
898 }
d6ef1678
DM
899 }
900
ae5d4331 901 check_at = fbm_instr( start_point, end_point,
7fba1cd6 902 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 903
675e93ee
DM
904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
905 " doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
906 (IV)((char*)start_point - strbeg),
907 (IV)((char*)end_point - strbeg),
908 (IV)(check_at ? check_at - strbeg : -1)
909 ));
910
8fd34720
DM
911 /* Update the count-of-usability, remove useless subpatterns,
912 unshift s. */
913
914 DEBUG_EXECUTE_r({
915 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
916 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
917 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
918 (check_at ? "Found" : "Did not find"),
919 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
920 ? "anchored" : "floating"),
921 quoted,
922 RE_SV_TAIL(check),
923 (check_at ? " at offset " : "...\n") );
924 });
2c2d71f5 925
8fd34720
DM
926 if (!check_at)
927 goto fail_finish;
8fd34720
DM
928 /* set rx_origin to the minimum position where the regex could start
929 * matching, given the constraint of the just-matched check substring.
930 * But don't set it lower than previously.
931 */
fdc003fd 932
8fd34720
DM
933 if (check_at - rx_origin > prog->check_offset_max)
934 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee
DM
935 /* Finish the diagnostic message */
936 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
937 "%ld (rx_origin now %"IVdf")...\n",
938 (long)(check_at - strbeg),
939 (IV)(rx_origin - strbeg)
940 ));
8fd34720 941 }
fdc003fd
DM
942
943
1a4edc3c 944 /* now look for the 'other' substring if defined */
2c2d71f5 945
6480a6c4
DM
946 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
947 : prog->substrs->data[other_ix].substr)
1de06328 948 {
30944b6d 949 /* Take into account the "other" substring. */
6c3343a6
DM
950 char *last, *last1;
951 char *s;
952 SV* must;
953 struct reg_substr_datum *other;
954
955 do_other_substr:
956 other = &prog->substrs->data[other_ix];
957
958 /* if "other" is anchored:
959 * we've previously found a floating substr starting at check_at.
960 * This means that the regex origin must lie somewhere
961 * between min (rx_origin): HOP3(check_at, -check_offset_max)
962 * and max: HOP3(check_at, -check_offset_min)
963 * (except that min will be >= strpos)
964 * So the fixed substr must lie somewhere between
965 * HOP3(min, anchored_offset)
966 * HOP3(max, anchored_offset) + SvCUR(substr)
967 */
968
969 /* if "other" is floating
970 * Calculate last1, the absolute latest point where the
971 * floating substr could start in the string, ignoring any
972 * constraints from the earlier fixed match. It is calculated
973 * as follows:
974 *
975 * strend - prog->minlen (in chars) is the absolute latest
976 * position within the string where the origin of the regex
977 * could appear. The latest start point for the floating
978 * substr is float_min_offset(*) on from the start of the
979 * regex. last1 simply combines thee two offsets.
980 *
981 * (*) You might think the latest start point should be
982 * float_max_offset from the regex origin, and technically
983 * you'd be correct. However, consider
984 * /a\d{2,4}bcd\w/
985 * Here, float min, max are 3,5 and minlen is 7.
986 * This can match either
987 * /a\d\dbcd\w/
988 * /a\d\d\dbcd\w/
989 * /a\d\d\d\dbcd\w/
990 * In the first case, the regex matches minlen chars; in the
991 * second, minlen+1, in the third, minlen+2.
992 * In the first case, the floating offset is 3 (which equals
993 * float_min), in the second, 4, and in the third, 5 (which
994 * equals float_max). In all cases, the floating string bcd
995 * can never start more than 4 chars from the end of the
996 * string, which equals minlen - float_min. As the substring
997 * starts to match more than float_min from the start of the
998 * regex, it makes the regex match more than minlen chars,
999 * and the two cancel each other out. So we can always use
1000 * float_min - minlen, rather than float_max - minlen for the
1001 * latest position in the string.
1002 *
1003 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1004 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1005 */
1006
e7a14a9c 1007 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1008 last1 = HOP3c(strend,
1009 other->min_offset - prog->minlen, strbeg);
1010
4d006249 1011 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1012 /* last is the latest point where the floating substr could
1013 * start, *given* any constraints from the earlier fixed
1014 * match. This constraint is that the floating string starts
1015 * <= float_max_offset chars from the regex origin (rx_origin).
1016 * If this value is less than last1, use it instead.
eb3831ce 1017 */
6c3343a6
DM
1018 assert(rx_origin <= last1);
1019 last =
1020 /* this condition handles the offset==infinity case, and
1021 * is a short-cut otherwise. Although it's comparing a
1022 * byte offset to a char length, it does so in a safe way,
1023 * since 1 char always occupies 1 or more bytes,
1024 * so if a string range is (last1 - rx_origin) bytes,
1025 * it will be less than or equal to (last1 - rx_origin)
1026 * chars; meaning it errs towards doing the accurate HOP3
1027 * rather than just using last1 as a short-cut */
1028 (last1 - rx_origin) < other->max_offset
1029 ? last1
1030 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1031 }
1032 else {
1033 assert(strpos + start_shift <= check_at);
1034 last = HOP4c(check_at, other->min_offset - start_shift,
1035 strbeg, strend);
1036 }
ead917d0 1037
6c3343a6
DM
1038 s = HOP3c(rx_origin, other->min_offset, strend);
1039 if (s < other_last) /* These positions already checked */
1040 s = other_last;
1041
1042 must = utf8_target ? other->utf8_substr : other->substr;
1043 assert(SvPOK(must));
675e93ee
DM
1044 {
1045 char *from = s;
1046 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1047
88203927
DM
1048 if (from > to) {
1049 s = NULL;
1050 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1051 " skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1052 (IV)(from - strbeg),
1053 (IV)(to - strbeg)
1054 ));
1055 }
1056 else {
1057 s = fbm_instr(
1058 (unsigned char*)from,
1059 (unsigned char*)to,
1060 must,
1061 multiline ? FBMrf_MULTILINE : 0
1062 );
1063 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1064 " doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1065 (IV)(from - strbeg),
1066 (IV)(to - strbeg),
1067 (IV)(s ? s - strbeg : -1)
1068 ));
1069 }
675e93ee
DM
1070 }
1071
6c3343a6
DM
1072 DEBUG_EXECUTE_r({
1073 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1074 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1075 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1076 s ? "Found" : "Contradicts",
1077 other_ix ? "floating" : "anchored",
1078 quoted, RE_SV_TAIL(must));
1079 });
ead917d0 1080
ead917d0 1081
6c3343a6
DM
1082 if (!s) {
1083 /* last1 is latest possible substr location. If we didn't
1084 * find it before there, we never will */
1085 if (last >= last1) {
1086 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee 1087 "; giving up...\n"));
6c3343a6 1088 goto fail_finish;
ead917d0
DM
1089 }
1090
6c3343a6
DM
1091 /* try to find the check substr again at a later
1092 * position. Maybe next time we'll find the "other" substr
1093 * in range too */
6c3343a6
DM
1094 other_last = HOP3c(last, 1, strend) /* highest failure */;
1095 rx_origin =
4d006249 1096 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1097 ? HOP3c(rx_origin, 1, strend)
1098 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
675e93ee
DM
1099 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1100 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1101 (other_ix ? "floating" : "anchored"),
1102 (long)(HOP3c(check_at, 1, strend) - strbeg),
1103 (IV)(rx_origin - strbeg)
1104 ));
6c3343a6
DM
1105 goto restart;
1106 }
1107 else {
4d006249 1108 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1109 /* other_last is set to s, not s+1, since its possible for
1110 * a floating substr to fail first time, then succeed
1111 * second time at the same floating position; e.g.:
1112 * "-AB--AABZ" =~ /\wAB\d*Z/
1113 * The first time round, anchored and float match at
1114 * "-(AB)--AAB(Z)" then fail on the initial \w character
1115 * class. Second time round, they match at "-AB--A(AB)(Z)".
1116 */
1117 other_last = s;
ead917d0
DM
1118 }
1119 else {
6c3343a6
DM
1120 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1121 other_last = HOP3c(s, 1, strend);
ead917d0 1122 }
675e93ee
DM
1123 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1124 " at offset %ld (rx_origin now %"IVdf")...\n",
1125 (long)(s - strbeg),
1126 (IV)(rx_origin - strbeg)
1127 ));
1128
6c3343a6 1129 }
cad2e5aa 1130 }
acba93e8
DM
1131 else {
1132 DEBUG_OPTIMISE_MORE_r(
1133 PerlIO_printf(Perl_debug_log,
1134 " Check-only match: offset min:%"IVdf" max:%"IVdf
1c1c599d 1135 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
675e93ee 1136 " strend:%"IVdf"\n",
acba93e8
DM
1137 (IV)prog->check_offset_min,
1138 (IV)prog->check_offset_max,
675e93ee
DM
1139 (IV)(check_at-strbeg),
1140 (IV)(rx_origin-strbeg),
1c1c599d 1141 (IV)(rx_origin-check_at),
675e93ee 1142 (IV)(strend-strbeg)
acba93e8
DM
1143 )
1144 );
1145 }
2c2d71f5 1146
acba93e8 1147 postprocess_substr_matches:
0991020e 1148
1a4edc3c 1149 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1150
7d2d37f5 1151 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1152 char *s;
1153
a62659bd
DM
1154 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1155 " looking for /^/m anchor"));
d0880ea7
DM
1156
1157 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1158 * Find the next \n, if any, even if it's beyond the current
1159 * anchored and/or floating substrings. Whether we should be
1160 * scanning ahead for the next \n or the next substr is debatable.
1161 * On the one hand you'd expect rare substrings to appear less
1162 * often than \n's. On the other hand, searching for \n means
675e93ee 1163 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1164 * iteration as the current "rarest" string candidate, which
1165 * means for example that we'll quickly reject the whole string if
1166 * hasn't got a \n, rather than trying every substr position
1167 * first
1168 */
d0880ea7 1169
4620cb61
DM
1170 s = HOP3c(strend, - prog->minlen, strpos);
1171 if (s <= rx_origin ||
1172 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1173 {
d0880ea7
DM
1174 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1175 " Did not find /%s^%s/m...\n",
1176 PL_colors[0], PL_colors[1]));
a62659bd
DM
1177 goto fail_finish;
1178 }
d0880ea7 1179
4ada1233
DM
1180 /* earliest possible origin is 1 char after the \n.
1181 * (since *rx_origin == '\n', it's safe to ++ here rather than
1182 * HOP(rx_origin, 1)) */
1183 rx_origin++;
d0880ea7 1184
f4f115de 1185 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1186 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1187 {
d0880ea7
DM
1188 /* Position contradicts check-string; either because
1189 * check was anchored (and thus has no wiggle room),
4ada1233 1190 * or check was float and rx_origin is above the float range */
d0880ea7 1191 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee
DM
1192 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1193 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1194 goto restart;
1195 }
1196
1197 /* if we get here, the check substr must have been float,
2e759faa 1198 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1199 * "other" substr which still contradicts */
1200 assert(prog->substrs->check_ix); /* check is float */
1201
1202 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1203 /* whoops, the anchored "other" substr exists, so we still
1204 * contradict. On the other hand, the float "check" substr
1205 * didn't contradict, so just retry the anchored "other"
1206 * substr */
1207 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee 1208 " Found /%s^%s/m, rescanning for anchored from offset %ld (rx_origin now %"IVdf")...\n",
d0880ea7 1209 PL_colors[0], PL_colors[1],
675e93ee
DM
1210 (long)(rx_origin - strbeg + prog->anchored_offset),
1211 (long)(rx_origin - strbeg)
1212 ));
d0880ea7
DM
1213 goto do_other_substr;
1214 }
1215
1216 /* success: we don't contradict the found floating substring
1217 * (and there's no anchored substr). */
d0880ea7 1218 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
675e93ee
DM
1219 " Found /%s^%s/m with rx_origin %ld...\n",
1220 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1221 }
1222 else {
2e759faa 1223 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
fe4f3442 1224 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1225 }
1226
ffad1e6a 1227 success_at_start:
e3c6feb0 1228
cad2e5aa 1229
dd170ff5
DM
1230 /* if we have a starting character class, then test that extra constraint.
1231 * (trie stclasses are too expensive to use here, we are better off to
1232 * leave it to regmatch itself) */
1233
f8fc2ecf 1234 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1235 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1236
2c75e362 1237 /* XXX this value could be pre-computed */
f8fc2ecf 1238 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1239 ? (reginfo->is_utf8_pat
1240 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1241 : STR_LEN(progi->regstclass))
66e933ab 1242 : 1);
1de06328 1243 char * endpos;
fa3bb21d 1244 char *s;
000dfd2d
DM
1245 /* latest pos that a matching float substr constrains rx start to */
1246 char *rx_max_float = NULL;
1247
c75a3985
DM
1248 /* if the current rx_origin is anchored, either by satisfying an
1249 * anchored substring constraint, or a /^.../m constraint, then we
1250 * can reject the current origin if the start class isn't found
1251 * at the current position. If we have a float-only match, then
1252 * rx_origin is constrained to a range; so look for the start class
1253 * in that range. if neither, then look for the start class in the
1254 * whole rest of the string */
1255
dd170ff5
DM
1256 /* XXX DAPM it's not clear what the minlen test is for, and why
1257 * it's not used in the floating case. Nothing in the test suite
1258 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1259 * Here are some old comments, which may or may not be correct:
1260 *
1261 * minlen == 0 is possible if regstclass is \b or \B,
1262 * and the fixed substr is ''$.
1263 * Since minlen is already taken into account, rx_origin+1 is
1264 * before strend; accidentally, minlen >= 1 guaranties no false
1265 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1266 * 0) below assumes that regstclass does not come from lookahead...
1267 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1268 * This leaves EXACTF-ish only, which are dealt with in
1269 * find_byclass().
1270 */
1271
7d2d37f5 1272 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
fa3bb21d 1273 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d
DM
1274 else if (prog->float_substr || prog->float_utf8) {
1275 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1276 endpos= HOP3c(rx_max_float, cl_l, strend);
1277 }
1de06328
YO
1278 else
1279 endpos= strend;
1280
1dc475d0
DM
1281 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1282 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
c43b5520 1283 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1dc475d0 1284 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1285 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1286
c43b5520 1287 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1288 reginfo);
be778b1a 1289 if (!s) {
6eb5f6b9 1290 if (endpos == strend) {
a3621e74 1291 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1292 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1293 goto fail;
1294 }
a3621e74 1295 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1dc475d0 1296 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1297 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1298 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1299 goto fail;
9fed8d02 1300
6eb5f6b9 1301 /* Contradict one of substrings */
97136c8a
DM
1302 if (prog->anchored_substr || prog->anchored_utf8) {
1303 if (prog->substrs->check_ix == 1) { /* check is float */
1304 /* Have both, check_string is floating */
1305 assert(rx_origin + start_shift <= check_at);
1306 if (rx_origin + start_shift != check_at) {
1307 /* not at latest position float substr could match:
c75a3985
DM
1308 * Recheck anchored substring, but not floating.
1309 * The condition above is in bytes rather than
1310 * chars for efficiency. It's conservative, in
1311 * that it errs on the side of doing 'goto
88203927
DM
1312 * do_other_substr'. In this case, at worst,
1313 * an extra anchored search may get done, but in
1314 * practice the extra fbm_instr() is likely to
1315 * get skipped anyway. */
97136c8a 1316 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
675e93ee
DM
1317 " about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1318 (long)(other_last - strbeg),
1319 (IV)(rx_origin - strbeg)
1320 ));
97136c8a 1321 goto do_other_substr;
3369914b 1322 }
3369914b
DM
1323 }
1324 }
97136c8a 1325 else {
9fed8d02
DM
1326 /* float-only */
1327
7d2d37f5 1328 if (ml_anch) {
c75a3985
DM
1329 /* In the presence of ml_anch, we might be able to
1330 * find another \n without breaking the current float
1331 * constraint. */
1332
1333 /* strictly speaking this should be HOP3c(..., 1, ...),
1334 * but since we goto a block of code that's going to
1335 * search for the next \n if any, its safe here */
9fed8d02 1336 rx_origin++;
9fed8d02 1337 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
675e93ee 1338 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1339 PL_colors[0], PL_colors[1],
675e93ee 1340 (long)(rx_origin - strbeg)) );
9fed8d02 1341 goto postprocess_substr_matches;
ab60c45a 1342 }
c75a3985
DM
1343
1344 /* strictly speaking this can never be true; but might
1345 * be if we ever allow intuit without substrings */
1346 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1347 goto fail;
c75a3985 1348
000dfd2d 1349 rx_origin = rx_max_float;
9fed8d02
DM
1350 }
1351
c75a3985
DM
1352 /* at this point, any matching substrings have been
1353 * contradicted. Start again... */
1354
9fed8d02 1355 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1356
1357 /* uses bytes rather than char calculations for efficiency.
1358 * It's conservative: it errs on the side of doing 'goto restart',
1359 * where there is code that does a proper char-based test */
9fed8d02 1360 if (rx_origin + start_shift + end_shift > strend) {
40268e5b 1361 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
9fed8d02
DM
1362 " Could not match STCLASS...\n") );
1363 goto fail;
1364 }
1365 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
675e93ee 1366 " about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
9fed8d02 1367 (prog->substrs->check_ix ? "floating" : "anchored"),
675e93ee
DM
1368 (long)(rx_origin + start_shift - strbeg),
1369 (IV)(rx_origin - strbeg)
1370 ));
9fed8d02 1371 goto restart;
6eb5f6b9 1372 }
9fed8d02 1373
c75a3985
DM
1374 /* Success !!! */
1375
5f9c6575 1376 if (rx_origin != s) {
a3621e74 1377 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1378 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1379 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1380 );
1381 }
1382 else {
a3621e74 1383 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1dc475d0 1384 " Does not contradict STCLASS...\n");
b7953727
JH
1385 );
1386 }
6eb5f6b9 1387 }
ffad1e6a
DM
1388
1389 /* Decide whether using the substrings helped */
1390
1391 if (rx_origin != strpos) {
1392 /* Fixed substring is found far enough so that the match
1393 cannot start at strpos. */
1394
1395 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1396 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1397 }
1398 else {
70563e16
DM
1399 /* The found rx_origin position does not prohibit matching at
1400 * strpos, so calling intuit didn't gain us anything. Decrement
1401 * the BmUSEFUL() count on the check substring, and if we reach
1402 * zero, free it. */
1403 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1404 && (utf8_target ? (
1405 prog->check_utf8 /* Could be deleted already */
1406 && --BmUSEFUL(prog->check_utf8) < 0
1407 && (prog->check_utf8 == prog->float_utf8)
1408 ) : (
1409 prog->check_substr /* Could be deleted already */
1410 && --BmUSEFUL(prog->check_substr) < 0
1411 && (prog->check_substr == prog->float_substr)
1412 )))
1413 {
1414 /* If flags & SOMETHING - do not do it many times on the same match */
1415 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1416 /* XXX Does the destruction order has to change with utf8_target? */
1417 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1418 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1419 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1420 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1421 check = NULL; /* abort */
ffad1e6a
DM
1422 /* XXXX This is a remnant of the old implementation. It
1423 looks wasteful, since now INTUIT can use many
1424 other heuristics. */
1425 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1426 }
1427 }
1428
1429 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1430 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1431 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1432
c765d6e0 1433 return rx_origin;
2c2d71f5
JH
1434
1435 fail_finish: /* Substring not found */
33b8afdf 1436 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1437 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1438 fail:
a3621e74 1439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
e4584336 1440 PL_colors[4], PL_colors[5]));
bd61b366 1441 return NULL;
cad2e5aa 1442}
9661b544 1443
70563e16 1444
a0a388a1 1445#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1446 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78
KW
1447 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1448 trie_utf8l, trie_flu8 } \
e7fd4aa1
KW
1449 trie_type = ((scan->flags == EXACT) \
1450 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1451 : (scan->flags == EXACTL) \
1452 ? (utf8_target ? trie_utf8l : trie_plain) \
1453 : (scan->flags == EXACTFA) \
1454 ? (utf8_target \
1455 ? trie_utf8_exactfa_fold \
1456 : trie_latin_utf8_exactfa_fold) \
1457 : (scan->flags == EXACTFLU8 \
1458 ? trie_flu8 \
1459 : (utf8_target \
1460 ? trie_utf8_fold \
1461 : trie_latin_utf8_fold)))
fab2782b 1462
fd3249ee 1463#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1464STMT_START { \
fab2782b 1465 STRLEN skiplen; \
baa60164 1466 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1467 switch (trie_type) { \
a4525e78 1468 case trie_flu8: \
780fcc9f 1469 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1470 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1471 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1472 } \
a4525e78 1473 goto do_trie_utf8_fold; \
31f05a37 1474 case trie_utf8_exactfa_fold: \
baa60164 1475 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1476 /* FALLTHROUGH */ \
fab2782b 1477 case trie_utf8_fold: \
a4525e78 1478 do_trie_utf8_fold: \
fab2782b 1479 if ( foldlen>0 ) { \
c80e42f3 1480 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1481 foldlen -= len; \
1482 uscan += len; \
1483 len=0; \
1484 } else { \
445bf929 1485 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1486 len = UTF8SKIP(uc); \
1487 skiplen = UNISKIP( uvc ); \
1488 foldlen -= skiplen; \
1489 uscan = foldbuf + skiplen; \
1490 } \
1491 break; \
baa60164
KW
1492 case trie_latin_utf8_exactfa_fold: \
1493 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1494 /* FALLTHROUGH */ \
fab2782b
YO
1495 case trie_latin_utf8_fold: \
1496 if ( foldlen>0 ) { \
c80e42f3 1497 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1498 foldlen -= len; \
1499 uscan += len; \
1500 len=0; \
1501 } else { \
1502 len = 1; \
31f05a37 1503 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
fab2782b
YO
1504 skiplen = UNISKIP( uvc ); \
1505 foldlen -= skiplen; \
1506 uscan = foldbuf + skiplen; \
1507 } \
1508 break; \
a4525e78 1509 case trie_utf8l: \
780fcc9f 1510 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1511 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1512 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1513 } \
780fcc9f 1514 /* FALLTHROUGH */ \
fab2782b 1515 case trie_utf8: \
c80e42f3 1516 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1517 break; \
1518 case trie_plain: \
1519 uvc = (UV)*uc; \
1520 len = 1; \
1521 } \
1522 if (uvc < 256) { \
1523 charid = trie->charmap[ uvc ]; \
1524 } \
1525 else { \
1526 charid = 0; \
1527 if (widecharmap) { \
1528 SV** const svpp = hv_fetch(widecharmap, \
1529 (char*)&uvc, sizeof(UV), 0); \
1530 if (svpp) \
1531 charid = (U16)SvIV(*svpp); \
1532 } \
1533 } \
4cadc6a9
YO
1534} STMT_END
1535
ae7c5b9b
KW
1536#define DUMP_EXEC_POS(li,s,doutf8) \
1537 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1538 startpos, doutf8)
1539
c84a03c5 1540#define REXEC_FBC_EXACTISH_SCAN(COND) \
4cadc6a9
YO
1541STMT_START { \
1542 while (s <= e) { \
c84a03c5 1543 if ( (COND) \
fac1af77 1544 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1545 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1546 goto got_it; \
1547 s++; \
1548 } \
1549} STMT_END
1550
c84a03c5 1551#define REXEC_FBC_UTF8_SCAN(CODE) \
4cadc6a9 1552STMT_START { \
9a902117 1553 while (s < strend) { \
c84a03c5 1554 CODE \
9a902117 1555 s += UTF8SKIP(s); \
4cadc6a9
YO
1556 } \
1557} STMT_END
1558
c84a03c5 1559#define REXEC_FBC_SCAN(CODE) \
4cadc6a9
YO
1560STMT_START { \
1561 while (s < strend) { \
c84a03c5 1562 CODE \
4cadc6a9
YO
1563 s++; \
1564 } \
1565} STMT_END
1566
05bd126c
KW
1567#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1568REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1569 if (COND) { \
1570 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1571 goto got_it; \
1572 else \
1573 tmp = doevery; \
1574 } \
1575 else \
1576 tmp = 1; \
4cadc6a9
YO
1577)
1578
05bd126c
KW
1579#define REXEC_FBC_CLASS_SCAN(COND) \
1580REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1581 if (COND) { \
1582 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1583 goto got_it; \
1584 else \
1585 tmp = doevery; \
1586 } \
1587 else \
1588 tmp = 1; \
4cadc6a9
YO
1589)
1590
c84a03c5 1591#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1592 if (utf8_target) { \
c84a03c5 1593 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
e1d1eefb
YO
1594 } \
1595 else { \
c84a03c5 1596 REXEC_FBC_CLASS_SCAN(COND); \
d981ef24 1597 }
05bd126c 1598
05bd126c
KW
1599/* The three macros below are slightly different versions of the same logic.
1600 *
1601 * The first is for /a and /aa when the target string is UTF-8. This can only
1602 * match ascii, but it must advance based on UTF-8. The other two handle the
1603 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1604 * for the boundary (or non-boundary) between a word and non-word character.
1605 * The utf8 and non-utf8 cases have the same logic, but the details must be
1606 * different. Find the "wordness" of the character just prior to this one, and
1607 * compare it with the wordness of this one. If they differ, we have a
1608 * boundary. At the beginning of the string, pretend that the previous
1609 * character was a new-line.
1610 *
1611 * All these macros uncleanly have side-effects with each other and outside
1612 * variables. So far it's been too much trouble to clean-up
1613 *
1614 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1615 * a word character or not.
1616 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1617 * word/non-word
1618 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1619 *
1620 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1621 * are looking for a boundary or for a non-boundary. If we are looking for a
1622 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1623 * see if this tentative match actually works, and if so, to quit the loop
1624 * here. And vice-versa if we are looking for a non-boundary.
1625 *
1626 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1627 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1628 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1629 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1630 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1631 * complement. But in that branch we complement tmp, meaning that at the
1632 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1633 * which means at the top of the loop in the next iteration, it is
1634 * TEST_NON_UTF8(s-1) */
b2f4e957 1635#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1636 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1637 tmp = TEST_NON_UTF8(tmp); \
1638 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1639 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1640 tmp = !tmp; \
1641 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1642 } \
1643 else { \
1644 IF_FAIL; \
1645 } \
1646 ); \
1647
1648/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1649 * TEST_UTF8 is a macro that for the same input code points returns identically
1650 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
236d82fd 1651#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1652 if (s == reginfo->strbeg) { \
1653 tmp = '\n'; \
1654 } \
1655 else { /* Back-up to the start of the previous character */ \
1656 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1657 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1658 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1659 } \
1660 tmp = TEST_UV(tmp); \
1661 LOAD_UTF8_CHARCLASS_ALNUM(); \
1662 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1663 if (tmp == ! (TEST_UTF8((U8 *) s))) { \
1664 tmp = !tmp; \
1665 IF_SUCCESS; \
1666 } \
1667 else { \
1668 IF_FAIL; \
1669 } \
1670 );
cfaf538b 1671
05bd126c
KW
1672/* Like the above two macros. UTF8_CODE is the complete code for handling
1673 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1674 * macros below */
baa60164 1675#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1676 if (utf8_target) { \
05bd126c 1677 UTF8_CODE \
63ac0dad
KW
1678 } \
1679 else { /* Not utf8 */ \
9d9163fb 1680 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 1681 tmp = TEST_NON_UTF8(tmp); \
05bd126c 1682 REXEC_FBC_SCAN( /* advances s while s < strend */ \
63ac0dad 1683 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 1684 IF_SUCCESS; \
760cfa8e 1685 tmp = !tmp; \
63ac0dad
KW
1686 } \
1687 else { \
1688 IF_FAIL; \
1689 } \
1690 ); \
1691 } \
c8519dc7
KW
1692 /* Here, things have been set up by the previous code so that tmp is the \
1693 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1694 * utf8ness of the target). We also have to check if this matches against \
1695 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1696 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1697 * string */ \
1698 if (tmp == ! TEST_NON_UTF8('\n')) { \
1699 IF_SUCCESS; \
1700 } \
1701 else { \
1702 IF_FAIL; \
1703 }
63ac0dad 1704
ae7c5b9b
KW
1705/* This is the macro to use when we want to see if something that looks like it
1706 * could match, actually does, and if so exits the loop */
1707#define REXEC_FBC_TRYIT \
1708 if ((reginfo->intuit || regtry(reginfo, &s))) \
1709 goto got_it
1710
1711/* The only difference between the BOUND and NBOUND cases is that
1712 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1713 * NBOUND. This is accomplished by passing it as either the if or else clause,
1714 * with the other one being empty (PLACEHOLDER is defined as empty).
1715 *
1716 * The TEST_FOO parameters are for operating on different forms of input, but
1717 * all should be ones that return identically for the same underlying code
1718 * points */
1719#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1720 FBC_BOUND_COMMON( \
1721 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1722 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1723
44129e46 1724#define FBC_BOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1725 FBC_BOUND_COMMON( \
1726 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1727 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1728
1729#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1730 FBC_BOUND_COMMON( \
1731 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1732 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1733
44129e46 1734#define FBC_NBOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1735 FBC_BOUND_COMMON( \
1736 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1737 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1738
64935bc6
KW
1739/* Takes a pointer to an inversion list, a pointer to its corresponding
1740 * inversion map, and a code point, and returns the code point's value
1741 * according to the two arrays. It assumes that all code points have a value.
1742 * This is used as the base macro for macros for particular properties */
1743#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
1744 invmap[_invlist_search(invlist, cp)]
1745
1746/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1747 * of a code point, returning the value for the first code point in the string.
1748 * And it takes the particular macro name that finds the desired value given a
1749 * code point. Merely convert the UTF-8 to code point and call the cp macro */
1750#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
1751 (__ASSERT_(pos < strend) \
1752 /* Note assumes is valid UTF-8 */ \
1753 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1754
1755/* Returns the GCB value for the input code point */
1756#define getGCB_VAL_CP(cp) \
1757 _generic_GET_BREAK_VAL_CP( \
1758 PL_GCB_invlist, \
02f811dd 1759 _Perl_GCB_invmap, \
64935bc6
KW
1760 (cp))
1761
1762/* Returns the GCB value for the first code point in the UTF-8 encoded string
1763 * bounded by pos and strend */
1764#define getGCB_VAL_UTF8(pos, strend) \
1765 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 1766
06ae2722
KW
1767
1768/* Returns the SB value for the input code point */
1769#define getSB_VAL_CP(cp) \
1770 _generic_GET_BREAK_VAL_CP( \
1771 PL_SB_invlist, \
bf4268fa 1772 _Perl_SB_invmap, \
06ae2722
KW
1773 (cp))
1774
1775/* Returns the SB value for the first code point in the UTF-8 encoded string
1776 * bounded by pos and strend */
1777#define getSB_VAL_UTF8(pos, strend) \
1778 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1779
ae3bb8ea
KW
1780/* Returns the WB value for the input code point */
1781#define getWB_VAL_CP(cp) \
1782 _generic_GET_BREAK_VAL_CP( \
1783 PL_WB_invlist, \
bf4268fa 1784 _Perl_WB_invmap, \
ae3bb8ea
KW
1785 (cp))
1786
1787/* Returns the WB value for the first code point in the UTF-8 encoded string
1788 * bounded by pos and strend */
1789#define getWB_VAL_UTF8(pos, strend) \
1790 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1791
786e8c11 1792/* We know what class REx starts with. Try to find this position... */
02d5137b 1793/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1794/* annoyingly all the vars in this routine have different names from their counterparts
1795 in regmatch. /grrr */
3c3eec57 1796STATIC char *
07be1b83 1797S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1798 const char *strend, regmatch_info *reginfo)
a687059c 1799{
73104a1b
KW
1800 dVAR;
1801 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1802 char *pat_string; /* The pattern's exactish string */
1803 char *pat_end; /* ptr to end char of pat_string */
1804 re_fold_t folder; /* Function for computing non-utf8 folds */
1805 const U8 *fold_array; /* array for folding ords < 256 */
1806 STRLEN ln;
1807 STRLEN lnc;
73104a1b
KW
1808 U8 c1;
1809 U8 c2;
1810 char *e;
1811 I32 tmp = 1; /* Scratch variable? */
ba44c216 1812 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1813 UV utf8_fold_flags = 0;
f9176b44 1814 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1815 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1816 with a result inverts that result, as 0^1 =
1817 1 and 1^1 = 0 */
1818 _char_class_number classnum;
1819
73104a1b 1820 RXi_GET_DECL(prog,progi);
2f7f8cb1 1821
73104a1b 1822 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1823
73104a1b
KW
1824 /* We know what class it must start with. */
1825 switch (OP(c)) {
a4525e78 1826 case ANYOFL:
780fcc9f 1827 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30
KW
1828
1829 if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
1830 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1831 }
1832
780fcc9f 1833 /* FALLTHROUGH */
ac44c12e 1834 case ANYOFD:
73104a1b
KW
1835 case ANYOF:
1836 if (utf8_target) {
1837 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1838 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b
KW
1839 }
1840 else {
1841 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1842 }
1843 break;
73104a1b 1844
098b07d5
KW
1845 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1846 assert(! is_utf8_pat);
924ba076 1847 /* FALLTHROUGH */
73104a1b 1848 case EXACTFA:
984e6dd1 1849 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1850 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1851 goto do_exactf_utf8;
1852 }
1853 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1854 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1855 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1856
2fdb7295
KW
1857 case EXACTF: /* This node only generated for non-utf8 patterns */
1858 assert(! is_utf8_pat);
73104a1b 1859 if (utf8_target) {
73104a1b
KW
1860 utf8_fold_flags = 0;
1861 goto do_exactf_utf8;
1862 }
1863 fold_array = PL_fold;
1864 folder = foldEQ;
1865 goto do_exactf_non_utf8;
1866
1867 case EXACTFL:
780fcc9f 1868 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
31f05a37 1869 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1870 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1871 goto do_exactf_utf8;
1872 }
1873 fold_array = PL_fold_locale;
1874 folder = foldEQ_locale;
1875 goto do_exactf_non_utf8;
3c760661 1876
73104a1b 1877 case EXACTFU_SS:
984e6dd1 1878 if (is_utf8_pat) {
73104a1b
KW
1879 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1880 }
1881 goto do_exactf_utf8;
16d951b7 1882
a4525e78
KW
1883 case EXACTFLU8:
1884 if (! utf8_target) { /* All code points in this node require
1885 UTF-8 to express. */
1886 break;
1887 }
613abc6d
KW
1888 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1889 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
1890 goto do_exactf_utf8;
1891
73104a1b 1892 case EXACTFU:
984e6dd1
DM
1893 if (is_utf8_pat || utf8_target) {
1894 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1895 goto do_exactf_utf8;
1896 }
fac1af77 1897
73104a1b
KW
1898 /* Any 'ss' in the pattern should have been replaced by regcomp,
1899 * so we don't have to worry here about this single special case
1900 * in the Latin1 range */
1901 fold_array = PL_fold_latin1;
1902 folder = foldEQ_latin1;
1903
924ba076 1904 /* FALLTHROUGH */
73104a1b 1905
c52b8b12 1906 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
73104a1b
KW
1907 are no glitches with fold-length differences
1908 between the target string and pattern */
1909
1910 /* The idea in the non-utf8 EXACTF* cases is to first find the
1911 * first character of the EXACTF* node and then, if necessary,
1912 * case-insensitively compare the full text of the node. c1 is the
1913 * first character. c2 is its fold. This logic will not work for
1914 * Unicode semantics and the german sharp ss, which hence should
1915 * not be compiled into a node that gets here. */
1916 pat_string = STRING(c);
1917 ln = STR_LEN(c); /* length to match in octets/bytes */
1918
1919 /* We know that we have to match at least 'ln' bytes (which is the
1920 * same as characters, since not utf8). If we have to match 3
1921 * characters, and there are only 2 availabe, we know without
1922 * trying that it will fail; so don't start a match past the
1923 * required minimum number from the far end */
ea3daa5d 1924 e = HOP3c(strend, -((SSize_t)ln), s);
73104a1b 1925
02d5137b 1926 if (reginfo->intuit && e < s) {
73104a1b
KW
1927 e = s; /* Due to minlen logic of intuit() */
1928 }
fac1af77 1929
73104a1b
KW
1930 c1 = *pat_string;
1931 c2 = fold_array[c1];
1932 if (c1 == c2) { /* If char and fold are the same */
1933 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1934 }
1935 else {
1936 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1937 }
1938 break;
fac1af77 1939
c52b8b12
KW
1940 do_exactf_utf8:
1941 {
73104a1b
KW
1942 unsigned expansion;
1943
1944 /* If one of the operands is in utf8, we can't use the simpler folding
1945 * above, due to the fact that many different characters can have the
1946 * same fold, or portion of a fold, or different- length fold */
1947 pat_string = STRING(c);
1948 ln = STR_LEN(c); /* length to match in octets/bytes */
1949 pat_end = pat_string + ln;
984e6dd1 1950 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
1951 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1952 : ln;
1953
1954 /* We have 'lnc' characters to match in the pattern, but because of
1955 * multi-character folding, each character in the target can match
1956 * up to 3 characters (Unicode guarantees it will never exceed
1957 * this) if it is utf8-encoded; and up to 2 if not (based on the
1958 * fact that the Latin 1 folds are already determined, and the
1959 * only multi-char fold in that range is the sharp-s folding to
1960 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1961 * string character. Adjust lnc accordingly, rounding up, so that
1962 * if we need to match at least 4+1/3 chars, that really is 5. */
1963 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1964 lnc = (lnc + expansion - 1) / expansion;
1965
1966 /* As in the non-UTF8 case, if we have to match 3 characters, and
1967 * only 2 are left, it's guaranteed to fail, so don't start a
1968 * match that would require us to go beyond the end of the string
1969 */
ea3daa5d 1970 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 1971
02d5137b 1972 if (reginfo->intuit && e < s) {
73104a1b
KW
1973 e = s; /* Due to minlen logic of intuit() */
1974 }
0658cdde 1975
73104a1b
KW
1976 /* XXX Note that we could recalculate e to stop the loop earlier,
1977 * as the worst case expansion above will rarely be met, and as we
1978 * go along we would usually find that e moves further to the left.
1979 * This would happen only after we reached the point in the loop
1980 * where if there were no expansion we should fail. Unclear if
1981 * worth the expense */
1982
1983 while (s <= e) {
1984 char *my_strend= (char *)strend;
1985 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 1986 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 1987 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
1988 {
1989 goto got_it;
1990 }
1991 s += (utf8_target) ? UTF8SKIP(s) : 1;
1992 }
1993 break;
1994 }
236d82fd 1995
73104a1b 1996 case BOUNDL:
780fcc9f 1997 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 1998 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
1999 if (! IN_UTF8_CTYPE_LOCALE) {
2000 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2001 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2002 }
64935bc6
KW
2003 goto do_boundu;
2004 }
2005
236d82fd 2006 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
73104a1b 2007 break;
64935bc6 2008
73104a1b 2009 case NBOUNDL:
780fcc9f 2010 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2011 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2012 if (! IN_UTF8_CTYPE_LOCALE) {
2013 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2014 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2015 }
64935bc6
KW
2016 goto do_nboundu;
2017 }
2018
236d82fd 2019 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
73104a1b 2020 break;
64935bc6
KW
2021
2022 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2023 meaning */
2024 assert(FLAGS(c) == TRADITIONAL_BOUND);
2025
236d82fd 2026 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b 2027 break;
64935bc6
KW
2028
2029 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2030 meaning */
2031 assert(FLAGS(c) == TRADITIONAL_BOUND);
2032
44129e46 2033 FBC_BOUND_A(isWORDCHAR_A);
73104a1b 2034 break;
64935bc6
KW
2035
2036 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2037 meaning */
2038 assert(FLAGS(c) == TRADITIONAL_BOUND);
2039
236d82fd 2040 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
73104a1b 2041 break;
64935bc6
KW
2042
2043 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2044 meaning */
2045 assert(FLAGS(c) == TRADITIONAL_BOUND);
2046
44129e46 2047 FBC_NBOUND_A(isWORDCHAR_A);
73104a1b 2048 break;
64935bc6 2049
73104a1b 2050 case NBOUNDU:
64935bc6
KW
2051 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2052 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2053 break;
2054 }
2055
2056 do_nboundu:
2057
2058 to_complement = 1;
2059 /* FALLTHROUGH */
2060
2061 case BOUNDU:
2062 do_boundu:
2063 switch((bound_type) FLAGS(c)) {
2064 case TRADITIONAL_BOUND:
2065 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2066 break;
2067 case GCB_BOUND:
2068 if (s == reginfo->strbeg) { /* GCB always matches at begin and
2069 end */
2070 if (to_complement ^ cBOOL(reginfo->intuit
2071 || regtry(reginfo, &s)))
2072 {
2073 goto got_it;
2074 }
2075 s += (utf8_target) ? UTF8SKIP(s) : 1;
2076 }
2077
2078 if (utf8_target) {
85e5f08b 2079 GCB_enum before = getGCB_VAL_UTF8(
64935bc6
KW
2080 reghop3((U8*)s, -1,
2081 (U8*)(reginfo->strbeg)),
2082 (U8*) reginfo->strend);
2083 while (s < strend) {
85e5f08b 2084 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
64935bc6
KW
2085 (U8*) reginfo->strend);
2086 if (to_complement ^ isGCB(before, after)) {
2087 if (reginfo->intuit || regtry(reginfo, &s)) {
2088 goto got_it;
2089 }
2090 before = after;
2091 }
2092 s += UTF8SKIP(s);
2093 }
2094 }
2095 else { /* Not utf8. Everything is a GCB except between CR and
2096 LF */
2097 while (s < strend) {
2098 if (to_complement ^ (UCHARAT(s - 1) != '\r'
2099 || UCHARAT(s) != '\n'))
2100 {
2101 if (reginfo->intuit || regtry(reginfo, &s)) {
2102 goto got_it;
2103 }
2104 s++;
2105 }
2106 }
2107 }
2108
2109 if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
2110 goto got_it;
2111 }
2112 break;
ae3bb8ea 2113
06ae2722
KW
2114 case SB_BOUND:
2115 if (s == reginfo->strbeg) { /* SB always matches at beginning */
2116 if (to_complement
2117 ^ cBOOL(reginfo->intuit || regtry(reginfo, &s)))
2118 {
2119 goto got_it;
2120 }
2121
2122 /* Didn't match. Go try at the next position */
2123 s += (utf8_target) ? UTF8SKIP(s) : 1;
2124 }
2125
2126 if (utf8_target) {
85e5f08b 2127 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
06ae2722
KW
2128 -1,
2129 (U8*)(reginfo->strbeg)),
2130 (U8*) reginfo->strend);
2131 while (s < strend) {
85e5f08b 2132 SB_enum after = getSB_VAL_UTF8((U8*) s,
06ae2722
KW
2133 (U8*) reginfo->strend);
2134 if (to_complement ^ isSB(before,
2135 after,
2136 (U8*) reginfo->strbeg,
2137 (U8*) s,
2138 (U8*) reginfo->strend,
2139 utf8_target))
2140 {
2141 if (reginfo->intuit || regtry(reginfo, &s)) {
2142 goto got_it;
2143 }
2144 before = after;
2145 }
2146 s += UTF8SKIP(s);
2147 }
2148 }
2149 else { /* Not utf8. */
85e5f08b 2150 SB_enum before = getSB_VAL_CP((U8) *(s -1));
06ae2722 2151 while (s < strend) {
85e5f08b 2152 SB_enum after = getSB_VAL_CP((U8) *s);
06ae2722
KW
2153 if (to_complement ^ isSB(before,
2154 after,
2155 (U8*) reginfo->strbeg,
2156 (U8*) s,
2157 (U8*) reginfo->strend,
2158 utf8_target))
2159 {
2160 if (reginfo->intuit || regtry(reginfo, &s)) {
2161 goto got_it;
2162 }
2163 before = after;
2164 }
2165 s++;
2166 }
2167 }
2168
2169 /* Here are at the final position in the target string. The SB
2170 * value is always true here, so matches, depending on other
2171 * constraints */
2172 if (to_complement ^ cBOOL(reginfo->intuit
2173 || regtry(reginfo, &s)))
2174 {
2175 goto got_it;
2176 }
2177
2178 break;
2179
ae3bb8ea
KW
2180 case WB_BOUND:
2181 if (s == reginfo->strbeg) {
2182 if (to_complement ^ cBOOL(reginfo->intuit
2183 || regtry(reginfo, &s)))
2184 {
2185 goto got_it;
2186 }
2187 s += (utf8_target) ? UTF8SKIP(s) : 1;
2188 }
2189
2190 if (utf8_target) {
2191 /* We are at a boundary between char_sub_0 and char_sub_1.
2192 * We also keep track of the value for char_sub_-1 as we
2193 * loop through the line. Context may be needed to make a
2194 * determination, and if so, this can save having to
2195 * recalculate it */
85e5f08b
KW
2196 WB_enum previous = WB_UNKNOWN;
2197 WB_enum before = getWB_VAL_UTF8(
ae3bb8ea
KW
2198 reghop3((U8*)s,
2199 -1,
2200 (U8*)(reginfo->strbeg)),
2201 (U8*) reginfo->strend);
2202 while (s < strend) {
85e5f08b 2203 WB_enum after = getWB_VAL_UTF8((U8*) s,
ae3bb8ea
KW
2204 (U8*) reginfo->strend);
2205 if (to_complement ^ isWB(previous,
2206 before,
2207 after,
2208 (U8*) reginfo->strbeg,
2209 (U8*) s,
2210 (U8*) reginfo->strend,
2211 utf8_target))
2212 {
2213 if (reginfo->intuit || regtry(reginfo, &s)) {
2214 goto got_it;
2215 }
2216 previous = before;
2217 before = after;
2218 }
2219 s += UTF8SKIP(s);
2220 }
2221 }
2222 else { /* Not utf8. */
85e5f08b
KW
2223 WB_enum previous = WB_UNKNOWN;
2224 WB_enum before = getWB_VAL_CP((U8) *(s -1));
ae3bb8ea 2225 while (s < strend) {
85e5f08b 2226 WB_enum after = getWB_VAL_CP((U8) *s);
ae3bb8ea
KW
2227 if (to_complement ^ isWB(previous,
2228 before,
2229 after,
2230 (U8*) reginfo->strbeg,
2231 (U8*) s,
2232 (U8*) reginfo->strend,
2233 utf8_target))
2234 {
2235 if (reginfo->intuit || regtry(reginfo, &s)) {
2236 goto got_it;
2237 }
2238 previous = before;
2239 before = after;
2240 }
2241 s++;
2242 }
2243 }
2244
2245 if (to_complement ^ cBOOL(reginfo->intuit
2246 || regtry(reginfo, &s)))
2247 {
2248 goto got_it;
2249 }
2250
2251 break;
64935bc6 2252 }
73104a1b 2253 break;
64935bc6 2254
73104a1b
KW
2255 case LNBREAK:
2256 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2257 is_LNBREAK_latin1_safe(s, strend)
2258 );
2259 break;
3018b823
KW
2260
2261 /* The argument to all the POSIX node types is the class number to pass to
2262 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2263
2264 case NPOSIXL:
2265 to_complement = 1;
2266 /* FALLTHROUGH */
2267
2268 case POSIXL:
780fcc9f 2269 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823
KW
2270 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2271 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 2272 break;
3018b823
KW
2273
2274 case NPOSIXD:
2275 to_complement = 1;
2276 /* FALLTHROUGH */
2277
2278 case POSIXD:
2279 if (utf8_target) {
2280 goto posix_utf8;
2281 }
2282 goto posixa;
2283
2284 case NPOSIXA:
2285 if (utf8_target) {
2286 /* The complement of something that matches only ASCII matches all
837226c8
KW
2287 * non-ASCII, plus everything in ASCII that isn't in the class. */
2288 REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
3018b823
KW
2289 || ! _generic_isCC_A(*s, FLAGS(c)));
2290 break;
2291 }
2292
2293 to_complement = 1;
2294 /* FALLTHROUGH */
2295
73104a1b 2296 case POSIXA:
3018b823 2297 posixa:
73104a1b 2298 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
2299 * byte invariant character. */
2300 REXEC_FBC_CLASS_SCAN(
2301 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 2302 break;
3018b823
KW
2303
2304 case NPOSIXU:
2305 to_complement = 1;
2306 /* FALLTHROUGH */
2307
2308 case POSIXU:
2309 if (! utf8_target) {
2310 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2311 FLAGS(c))));
2312 }
2313 else {
2314
c52b8b12 2315 posix_utf8:
3018b823
KW
2316 classnum = (_char_class_number) FLAGS(c);
2317 if (classnum < _FIRST_NON_SWASH_CC) {
2318 while (s < strend) {
2319
2320 /* We avoid loading in the swash as long as possible, but
2321 * should we have to, we jump to a separate loop. This
2322 * extra 'if' statement is what keeps this code from being
2323 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2324 if (UTF8_IS_ABOVE_LATIN1(*s)) {
2325 goto found_above_latin1;
2326 }
2327 if ((UTF8_IS_INVARIANT(*s)
2328 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2329 classnum)))
2330 || (UTF8_IS_DOWNGRADEABLE_START(*s)
2331 && to_complement ^ cBOOL(
94bb8c36
KW
2332 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2333 *(s + 1)),
3018b823
KW
2334 classnum))))
2335 {
02d5137b 2336 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
2337 goto got_it;
2338 else {
2339 tmp = doevery;
2340 }
2341 }
2342 else {
2343 tmp = 1;
2344 }
2345 s += UTF8SKIP(s);
2346 }
2347 }
2348 else switch (classnum) { /* These classes are implemented as
2349 macros */
779cf272 2350 case _CC_ENUM_SPACE:
3018b823
KW
2351 REXEC_FBC_UTF8_CLASS_SCAN(
2352 to_complement ^ cBOOL(isSPACE_utf8(s)));
2353 break;
2354
2355 case _CC_ENUM_BLANK:
2356 REXEC_FBC_UTF8_CLASS_SCAN(
2357 to_complement ^ cBOOL(isBLANK_utf8(s)));
2358 break;
2359
2360 case _CC_ENUM_XDIGIT:
2361 REXEC_FBC_UTF8_CLASS_SCAN(
2362 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2363 break;
2364
2365 case _CC_ENUM_VERTSPACE:
2366 REXEC_FBC_UTF8_CLASS_SCAN(
2367 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2368 break;
2369
2370 case _CC_ENUM_CNTRL:
2371 REXEC_FBC_UTF8_CLASS_SCAN(
2372 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2373 break;
2374
2375 default:
2376 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
e5964223 2377 NOT_REACHED; /* NOTREACHED */
3018b823
KW
2378 }
2379 }
2380 break;
2381
2382 found_above_latin1: /* Here we have to load a swash to get the result
2383 for the current code point */
2384 if (! PL_utf8_swash_ptrs[classnum]) {
2385 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2386 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2387 _core_swash_init("utf8",
2388 "",
2389 &PL_sv_undef, 1, 0,
2390 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2391 }
2392
2393 /* This is a copy of the loop above for swash classes, though using the
2394 * FBC macro instead of being expanded out. Since we've loaded the
2395 * swash, we don't have to check for that each time through the loop */
2396 REXEC_FBC_UTF8_CLASS_SCAN(
2397 to_complement ^ cBOOL(_generic_utf8(
2398 classnum,
2399 s,
2400 swash_fetch(PL_utf8_swash_ptrs[classnum],
2401 (U8 *) s, TRUE))));
73104a1b
KW
2402 break;
2403
2404 case AHOCORASICKC:
2405 case AHOCORASICK:
2406 {
2407 DECL_TRIE_TYPE(c);
2408 /* what trie are we using right now */
2409 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2410 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2411 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2412
2413 const char *last_start = strend - trie->minlen;
6148ee25 2414#ifdef DEBUGGING
73104a1b 2415 const char *real_start = s;
6148ee25 2416#endif
73104a1b
KW
2417 STRLEN maxlen = trie->maxlen;
2418 SV *sv_points;
2419 U8 **points; /* map of where we were in the input string
2420 when reading a given char. For ASCII this
2421 is unnecessary overhead as the relationship
2422 is always 1:1, but for Unicode, especially
2423 case folded Unicode this is not true. */
2424 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2425 U8 *bitmap=NULL;
2426
2427
2428 GET_RE_DEBUG_FLAGS_DECL;
2429
2430 /* We can't just allocate points here. We need to wrap it in
2431 * an SV so it gets freed properly if there is a croak while
2432 * running the match */
2433 ENTER;
2434 SAVETMPS;
2435 sv_points=newSV(maxlen * sizeof(U8 *));
2436 SvCUR_set(sv_points,
2437 maxlen * sizeof(U8 *));
2438 SvPOK_on(sv_points);
2439 sv_2mortal(sv_points);
2440 points=(U8**)SvPV_nolen(sv_points );
2441 if ( trie_type != trie_utf8_fold
2442 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2443 {
2444 if (trie->bitmap)
2445 bitmap=(U8*)trie->bitmap;
2446 else
2447 bitmap=(U8*)ANYOF_BITMAP(c);
2448 }
2449 /* this is the Aho-Corasick algorithm modified a touch
2450 to include special handling for long "unknown char" sequences.
2451 The basic idea being that we use AC as long as we are dealing
2452 with a possible matching char, when we encounter an unknown char
2453 (and we have not encountered an accepting state) we scan forward
2454 until we find a legal starting char.
2455 AC matching is basically that of trie matching, except that when
2456 we encounter a failing transition, we fall back to the current
2457 states "fail state", and try the current char again, a process
2458 we repeat until we reach the root state, state 1, or a legal
2459 transition. If we fail on the root state then we can either
2460 terminate if we have reached an accepting state previously, or
2461 restart the entire process from the beginning if we have not.
2462
2463 */
2464 while (s <= last_start) {
2465 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2466 U8 *uc = (U8*)s;
2467 U16 charid = 0;
2468 U32 base = 1;
2469 U32 state = 1;
2470 UV uvc = 0;
2471 STRLEN len = 0;
2472 STRLEN foldlen = 0;
2473 U8 *uscan = (U8*)NULL;
2474 U8 *leftmost = NULL;
2475#ifdef DEBUGGING
2476 U32 accepted_word= 0;
786e8c11 2477#endif
73104a1b
KW
2478 U32 pointpos = 0;
2479
2480 while ( state && uc <= (U8*)strend ) {
2481 int failed=0;
2482 U32 word = aho->states[ state ].wordnum;
2483
2484 if( state==1 ) {
2485 if ( bitmap ) {
2486 DEBUG_TRIE_EXECUTE_r(
2487 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2488 dump_exec_pos( (char *)uc, c, strend, real_start,
2489 (char *)uc, utf8_target );
2490 PerlIO_printf( Perl_debug_log,
2491 " Scanning for legal start char...\n");
2492 }
2493 );
2494 if (utf8_target) {
2495 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2496 uc += UTF8SKIP(uc);
2497 }
2498 } else {
2499 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2500 uc++;
2501 }
786e8c11 2502 }
73104a1b 2503 s= (char *)uc;
07be1b83 2504 }
73104a1b
KW
2505 if (uc >(U8*)last_start) break;
2506 }
2507
2508 if ( word ) {
2509 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2510 if (!leftmost || lpos < leftmost) {
2511 DEBUG_r(accepted_word=word);
2512 leftmost= lpos;
7016d6eb 2513 }
73104a1b 2514 if (base==0) break;
7016d6eb 2515
73104a1b
KW
2516 }
2517 points[pointpos++ % maxlen]= uc;
2518 if (foldlen || uc < (U8*)strend) {
2519 REXEC_TRIE_READ_CHAR(trie_type, trie,
2520 widecharmap, uc,
2521 uscan, len, uvc, charid, foldlen,
2522 foldbuf, uniflags);
2523 DEBUG_TRIE_EXECUTE_r({
2524 dump_exec_pos( (char *)uc, c, strend,
2525 real_start, s, utf8_target);
2526 PerlIO_printf(Perl_debug_log,
2527 " Charid:%3u CP:%4"UVxf" ",
2528 charid, uvc);
2529 });
2530 }
2531 else {
2532 len = 0;
2533 charid = 0;
2534 }
07be1b83 2535
73104a1b
KW
2536
2537 do {
6148ee25 2538#ifdef DEBUGGING
73104a1b 2539 word = aho->states[ state ].wordnum;
6148ee25 2540#endif
73104a1b
KW
2541 base = aho->states[ state ].trans.base;
2542
2543 DEBUG_TRIE_EXECUTE_r({
2544 if (failed)
2545 dump_exec_pos( (char *)uc, c, strend, real_start,
2546 s, utf8_target );
2547 PerlIO_printf( Perl_debug_log,
2548 "%sState: %4"UVxf", word=%"UVxf,
2549 failed ? " Fail transition to " : "",
2550 (UV)state, (UV)word);
2551 });
2552 if ( base ) {
2553 U32 tmp;
2554 I32 offset;
2555 if (charid &&
2556 ( ((offset = base + charid
2557 - 1 - trie->uniquecharcount)) >= 0)
2558 && ((U32)offset < trie->lasttrans)
2559 && trie->trans[offset].check == state
2560 && (tmp=trie->trans[offset].next))
2561 {
2562 DEBUG_TRIE_EXECUTE_r(
2563 PerlIO_printf( Perl_debug_log," - legal\n"));
2564 state = tmp;
2565 break;
07be1b83
YO
2566 }
2567 else {
786e8c11 2568 DEBUG_TRIE_EXECUTE_r(
73104a1b 2569 PerlIO_printf( Perl_debug_log," - fail\n"));
786e8c11 2570 failed = 1;
73104a1b 2571 state = aho->fail[state];
07be1b83 2572 }
07be1b83 2573 }
73104a1b
KW
2574 else {
2575 /* we must be accepting here */
2576 DEBUG_TRIE_EXECUTE_r(
2577 PerlIO_printf( Perl_debug_log," - accepting\n"));
2578 failed = 1;
2579 break;
786e8c11 2580 }
73104a1b
KW
2581 } while(state);
2582 uc += len;
2583 if (failed) {
2584 if (leftmost)
2585 break;
2586 if (!state) state = 1;
07be1b83 2587 }
73104a1b
KW
2588 }
2589 if ( aho->states[ state ].wordnum ) {
2590 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2591 if (!leftmost || lpos < leftmost) {
2592 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2593 leftmost = lpos;
07be1b83
YO
2594 }
2595 }
73104a1b
KW
2596 if (leftmost) {
2597 s = (char*)leftmost;
2598 DEBUG_TRIE_EXECUTE_r({
2599 PerlIO_printf(
2600 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2601 (UV)accepted_word, (IV)(s - real_start)
2602 );
2603 });
02d5137b 2604 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2605 FREETMPS;
2606 LEAVE;
2607 goto got_it;
2608 }
2609 s = HOPc(s,1);
2610 DEBUG_TRIE_EXECUTE_r({
2611 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2612 });
2613 } else {
2614 DEBUG_TRIE_EXECUTE_r(
2615 PerlIO_printf( Perl_debug_log,"No match.\n"));
2616 break;
2617 }
2618 }
2619 FREETMPS;
2620 LEAVE;
2621 }
2622 break;
2623 default:
2624 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
2625 }
2626 return 0;
2627 got_it:
2628 return s;
6eb5f6b9
JH
2629}
2630
60165aa4
DM
2631/* set RX_SAVED_COPY, RX_SUBBEG etc.
2632 * flags have same meanings as with regexec_flags() */
2633
749f4950
DM
2634static void
2635S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2636 char *strbeg,
2637 char *strend,
2638 SV *sv,
2639 U32 flags,
2640 bool utf8_target)
2641{
2642 struct regexp *const prog = ReANY(rx);
2643
60165aa4
DM
2644 if (flags & REXEC_COPY_STR) {
2645#ifdef PERL_ANY_COW
2646 if (SvCANCOW(sv)) {
2647 if (DEBUG_C_TEST) {
2648 PerlIO_printf(Perl_debug_log,
2649 "Copy on write: regexp capture, type %d\n",
2650 (int) SvTYPE(sv));
2651 }
5411a0e5
DM
2652 /* Create a new COW SV to share the match string and store
2653 * in saved_copy, unless the current COW SV in saved_copy
2654 * is valid and suitable for our purpose */
2655 if (( prog->saved_copy
2656 && SvIsCOW(prog->saved_copy)
2657 && SvPOKp(prog->saved_copy)
2658 && SvIsCOW(sv)
2659 && SvPOKp(sv)
2660 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2661 {
5411a0e5
DM
2662 /* just reuse saved_copy SV */
2663 if (RXp_MATCH_COPIED(prog)) {
2664 Safefree(prog->subbeg);
2665 RXp_MATCH_COPIED_off(prog);
2666 }
2667 }
2668 else {
2669 /* create new COW SV to share string */
a76b0e90
DM
2670 RX_MATCH_COPY_FREE(rx);
2671 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2672 }
5411a0e5
DM
2673 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2674 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2675 prog->sublen = strend - strbeg;
2676 prog->suboffset = 0;
2677 prog->subcoffset = 0;
2678 } else
2679#endif
2680 {
99a90e59
FC
2681 SSize_t min = 0;
2682 SSize_t max = strend - strbeg;
ea3daa5d 2683 SSize_t sublen;
60165aa4
DM
2684
2685 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2686 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2687 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2688 ) { /* don't copy $' part of string */
2689 U32 n = 0;
2690 max = -1;
2691 /* calculate the right-most part of the string covered
2692 * by a capture. Due to look-ahead, this may be to
2693 * the right of $&, so we have to scan all captures */
2694 while (n <= prog->lastparen) {
2695 if (prog->offs[n].end > max)
2696 max = prog->offs[n].end;
2697 n++;
2698 }
2699 if (max == -1)
2700 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2701 ? prog->offs[0].start
2702 : 0;
2703 assert(max >= 0 && max <= strend - strbeg);
2704 }
2705
2706 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2707 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2708 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2709 ) { /* don't copy $` part of string */
2710 U32 n = 0;
2711 min = max;
2712 /* calculate the left-most part of the string covered
2713 * by a capture. Due to look-behind, this may be to
2714 * the left of $&, so we have to scan all captures */
2715 while (min && n <= prog->lastparen) {
2716 if ( prog->offs[n].start != -1
2717 && prog->offs[n].start < min)
2718 {
2719 min = prog->offs[n].start;
2720 }
2721 n++;
2722 }
2723 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2724 && min > prog->offs[0].end
2725 )
2726 min = prog->offs[0].end;
2727
2728 }
2729
2730 assert(min >= 0 && min <= max && min <= strend - strbeg);
2731 sublen = max - min;
2732
2733 if (RX_MATCH_COPIED(rx)) {
2734 if (sublen > prog->sublen)
2735 prog->subbeg =
2736 (char*)saferealloc(prog->subbeg, sublen+1);
2737 }
2738 else
2739 prog->subbeg = (char*)safemalloc(sublen+1);
2740 Copy(strbeg + min, prog->subbeg, sublen, char);
2741 prog->subbeg[sublen] = '\0';
2742 prog->suboffset = min;
2743 prog->sublen = sublen;
2744 RX_MATCH_COPIED_on(rx);
2745 }
2746 prog->subcoffset = prog->suboffset;
2747 if (prog->suboffset && utf8_target) {
2748 /* Convert byte offset to chars.
2749 * XXX ideally should only compute this if @-/@+
2750 * has been seen, a la PL_sawampersand ??? */
2751
2752 /* If there's a direct correspondence between the
2753 * string which we're matching and the original SV,
2754 * then we can use the utf8 len cache associated with
2755 * the SV. In particular, it means that under //g,
2756 * sv_pos_b2u() will use the previously cached
2757 * position to speed up working out the new length of
2758 * subcoffset, rather than counting from the start of
2759 * the string each time. This stops
2760 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2761 * from going quadratic */
2762 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2763 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2764 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2765 else
2766 prog->subcoffset = utf8_length((U8*)strbeg,
2767 (U8*)(strbeg+prog->suboffset));
2768 }
2769 }
2770 else {
2771 RX_MATCH_COPY_FREE(rx);
2772 prog->subbeg = strbeg;
2773 prog->suboffset = 0;
2774 prog->subcoffset = 0;
2775 prog->sublen = strend - strbeg;
2776 }
2777}
2778
2779
2780
fae667d5 2781
6eb5f6b9
JH
2782/*
2783 - regexec_flags - match a regexp against a string
2784 */
2785I32
5aaab254 2786Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2787 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2788/* stringarg: the point in the string at which to begin matching */
2789/* strend: pointer to null at end of string */
2790/* strbeg: real beginning of string */
2791/* minend: end of match must be >= minend bytes after stringarg. */
2792/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2793 * itself is accessed via the pointers above */
2794/* data: May be used for some additional optimizations.
d058ec57 2795 Currently unused. */
a340edde 2796/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2797
6eb5f6b9 2798{
8d919b0a 2799 struct regexp *const prog = ReANY(rx);
5aaab254 2800 char *s;
eb578fdb 2801 regnode *c;
03c83e26 2802 char *startpos;
ea3daa5d
FC
2803 SSize_t minlen; /* must match at least this many chars */
2804 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2805 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2806 I32 multiline;
f8fc2ecf 2807 RXi_GET_DECL(prog,progi);
02d5137b
DM
2808 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2809 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2810 regexp_paren_pair *swap = NULL;
006f26b2 2811 I32 oldsave;
a3621e74
YO
2812 GET_RE_DEBUG_FLAGS_DECL;
2813
7918f24d 2814 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2815 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2816
2817 /* Be paranoid... */
3dc78631 2818 if (prog == NULL) {
6eb5f6b9 2819 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
2820 }
2821
6c3fea77 2822 DEBUG_EXECUTE_r(
03c83e26 2823 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2824 "Matching");
2825 );
8adc0f72 2826
b342a604
DM
2827 startpos = stringarg;
2828
58430ea8 2829 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2830 MAGIC *mg;
2831
fef7148b
DM
2832 /* set reginfo->ganch, the position where \G can match */
2833
2834 reginfo->ganch =
2835 (flags & REXEC_IGNOREPOS)
2836 ? stringarg /* use start pos rather than pos() */
3dc78631 2837 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2838 /* Defined pos(): */
2839 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2840 : strbeg; /* pos() not defined; use start of string */
2841
2842 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
7b0eb0b8 2843 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2844
03c83e26
DM
2845 /* in the presence of \G, we may need to start looking earlier in
2846 * the string than the suggested start point of stringarg:
0b2c2a84 2847 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2848 * offset, such as
2849 * /..\G/: gofs = 2
2850 * /ab|c\G/: gofs = 1
2851 * or if the minimum offset isn't known, then we have to go back
2852 * to the start of the string, e.g. /w+\G/
2853 */
2bfbe302 2854
8e1490ee 2855 if (prog->intflags & PREGf_ANCH_GPOS) {
2bfbe302
DM
2856 startpos = reginfo->ganch - prog->gofs;
2857 if (startpos <
2858 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2859 {
2860 DEBUG_r(PerlIO_printf(Perl_debug_log,
2861 "fail: ganch-gofs before earliest possible start\n"));
2862 return 0;
2863 }
2864 }
2865 else if (prog->gofs) {
b342a604
DM
2866 if (startpos - prog->gofs < strbeg)
2867 startpos = strbeg;
2868 else
2869 startpos -= prog->gofs;
03c83e26 2870 }
58430ea8 2871 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2872 startpos = strbeg;
03c83e26
DM
2873 }
2874
2875 minlen = prog->minlen;
b342a604 2876 if ((startpos + minlen) > strend || startpos < strbeg) {
03c83e26
DM
2877 DEBUG_r(PerlIO_printf(Perl_debug_log,
2878 "Regex match can't succeed, so not even tried\n"));
2879 return 0;
2880 }
2881
63a3746a
DM
2882 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2883 * which will call destuctors to reset PL_regmatch_state, free higher
2884 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2885 * regmatch_info_aux_eval */
2886
2887 oldsave = PL_savestack_ix;
2888
dfa77d06
DM
2889 s = startpos;
2890
e322109a 2891 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2892 && !(flags & REXEC_CHECKED))
2893 {
dfa77d06 2894 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 2895 flags, NULL);
dfa77d06 2896 if (!s)
7fadf4a7
DM
2897 return 0;
2898
e322109a 2899 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
2900 /* we can match based purely on the result of INTUIT.
2901 * Set up captures etc just for $& and $-[0]
2902 * (an intuit-only match wont have $1,$2,..) */
2903 assert(!prog->nparens);
d5e7783a
DM
2904
2905 /* s/// doesn't like it if $& is earlier than where we asked it to
2906 * start searching (which can happen on something like /.\G/) */
2907 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2908 && (s < stringarg))
2909 {
2910 /* this should only be possible under \G */
58430ea8 2911 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
2912 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2913 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2914 goto phooey;
2915 }
2916
7fadf4a7
DM
2917 /* match via INTUIT shouldn't have any captures.
2918 * Let @-, @+, $^N know */
2919 prog->lastparen = prog->lastcloseparen = 0;
2920 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
2921 prog->offs[0].start = s - strbeg;
2922 prog->offs[0].end = utf8_target
2923 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2924 : s - strbeg + prog->minlenret;
7fadf4a7 2925 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 2926 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
2927 strbeg, strend,
2928 sv, flags, utf8_target);
2929
7fadf4a7
DM
2930 return 1;
2931 }
2932 }
2933
6c3fea77 2934 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 2935
dfa77d06 2936 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
a3621e74 2937 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
a72c7584
JH
2938 "String too short [regexec_flags]...\n"));
2939 goto phooey;
1aa99e6b 2940 }
1de06328 2941
6eb5f6b9 2942 /* Check validity of program. */
f8fc2ecf 2943 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
2944 Perl_croak(aTHX_ "corrupted regexp program");
2945 }
2946
1738e041 2947 RX_MATCH_TAINTED_off(rx);
ab4e48c1 2948 RX_MATCH_UTF8_set(rx, utf8_target);
1738e041 2949
6c3fea77
DM
2950 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2951 reginfo->intuit = 0;
2952 reginfo->is_utf8_target = cBOOL(utf8_target);
02d5137b
DM
2953 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2954 reginfo->warned = FALSE;
9d9163fb 2955 reginfo->strbeg = strbeg;
02d5137b 2956 reginfo->sv = sv;
1cb48e53 2957 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
220db18a 2958 reginfo->strend = strend;
6eb5f6b9 2959 /* see how far we have to get to not match where we matched before */
fe3974be 2960 reginfo->till = stringarg + minend;
6eb5f6b9 2961
60779a30 2962 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
2963 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2964 S_cleanup_regmatch_info_aux has executed (registered by
2965 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2966 magic belonging to this SV.
2967 Not newSVsv, either, as it does not COW.
2968 */
2969 reginfo->sv = newSV(0);
4cba5ac0 2970 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
2971 SAVEFREESV(reginfo->sv);
2972 }
2973
331b2dcc
DM
2974 /* reserve next 2 or 3 slots in PL_regmatch_state:
2975 * slot N+0: may currently be in use: skip it
2976 * slot N+1: use for regmatch_info_aux struct
2977 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2978 * slot N+3: ready for use by regmatch()
2979 */
bf2039a9 2980
331b2dcc
DM
2981 {
2982 regmatch_state *old_regmatch_state;
2983 regmatch_slab *old_regmatch_slab;
2984 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2985
2986 /* on first ever match, allocate first slab */
2987 if (!PL_regmatch_slab) {
2988 Newx(PL_regmatch_slab, 1, regmatch_slab);
2989 PL_regmatch_slab->prev = NULL;
2990 PL_regmatch_slab->next = NULL;
2991 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2992 }
bf2039a9 2993
331b2dcc
DM
2994 old_regmatch_state = PL_regmatch_state;
2995 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 2996
331b2dcc
DM
2997 for (i=0; i <= max; i++) {
2998 if (i == 1)
2999 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3000 else if (i ==2)
3001 reginfo->info_aux_eval =
3002 reginfo->info_aux->info_aux_eval =
3003 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 3004
331b2dcc
DM
3005 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3006 PL_regmatch_state = S_push_slab(aTHX);
3007 }
bf2039a9 3008
331b2dcc
DM
3009 /* note initial PL_regmatch_state position; at end of match we'll
3010 * pop back to there and free any higher slabs */
bf2039a9 3011
331b2dcc
DM
3012 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3013 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 3014 reginfo->info_aux->poscache = NULL;
bf2039a9 3015
331b2dcc 3016 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 3017
331b2dcc
DM
3018 if ((prog->extflags & RXf_EVAL_SEEN))
3019 S_setup_eval_state(aTHX_ reginfo);
3020 else
3021 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 3022 }
d3aa529c 3023
6eb5f6b9 3024 /* If there is a "must appear" string, look for it. */
6eb5f6b9 3025
288b8c02 3026 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
3027 /* We have to be careful. If the previous successful match
3028 was from this regex we don't want a subsequent partially
3029 successful match to clobber the old results.
3030 So when we detect this possibility we add a swap buffer
d8da0584
KW
3031 to the re, and switch the buffer each match. If we fail,
3032 we switch it back; otherwise we leave it swapped.
e9105d30
GG
3033 */
3034 swap = prog->offs;
3035 /* do we need a save destructor here for eval dies? */
3036 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
495f47a5
DM
3037 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3038 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3039 PTR2UV(prog),
3040 PTR2UV(swap),
3041 PTR2UV(prog->offs)
3042 ));
c74340f9 3043 }
6eb5f6b9 3044
0fa70a06
DM
3045 /* Simplest case: anchored match need be tried only once, or with
3046 * MBOL, only at the beginning of each line.
3047 *
3048 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3049 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3050 * match at the start of the string then it won't match anywhere else
3051 * either; while with /.*.../, if it doesn't match at the beginning,
3052 * the earliest it could match is at the start of the next line */
3053
8e1490ee 3054 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
0fa70a06
DM
3055 char *end;
3056
3057 if (regtry(reginfo, &s))
6eb5f6b9 3058 goto got_it;
0fa70a06
DM
3059
3060 if (!(prog->intflags & PREGf_ANCH_MBOL))
3061 goto phooey;
3062
3063 /* didn't match at start, try at other newline positions */
3064
3065 if (minlen)
3066 dontbother = minlen - 1;
3067 end = HOP3c(strend, -dontbother, strbeg) - 1;
3068
3069 /* skip to next newline */
3070
3071 while (s <= end) { /* note it could be possible to match at the end of the string */
3072 /* NB: newlines are the same in unicode as they are in latin */
3073 if (*s++ != '\n')
3074 continue;
3075 if (prog->check_substr || prog->check_utf8) {
3076 /* note that with PREGf_IMPLICIT, intuit can only fail
3077 * or return the start position, so it's of limited utility.
3078 * Nevertheless, I made the decision that the potential for
3079 * quick fail was still worth it - DAPM */
3080 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3081 if (!s)
3082 goto phooey;
3083 }
3084 if (regtry(reginfo, &s))
3085 goto got_it;
3086 }
3087 goto phooey;
3088 } /* end anchored search */
3089
3090 if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 3091 {
a8430a8b
YO
3092 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3093 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
3094 /* For anchored \G, the only position it can match from is
3095 * (ganch-gofs); we already set startpos to this above; if intuit
3096 * moved us on from there, we can't possibly succeed */
3097 assert(startpos == reginfo->ganch - prog->gofs);
3098 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
3099 goto got_it;
3100 goto phooey;
3101 }
3102
3103 /* Messy cases: unanchored match. */
bbe252da 3104 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 3105 /* we have /x+whatever/ */
984e6dd1 3106 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 3107 char ch;
bf93d4cc
GS
3108#ifdef DEBUGGING
3109 int did_match = 0;
3110#endif
f2ed9b32 3111 if (utf8_target) {
7e0d5ad7
KW
3112 if (! prog->anchored_utf8) {
3113 to_utf8_substr(prog);
3114 }
3115 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 3116 REXEC_FBC_SCAN(
6eb5f6b9 3117 if (*s == ch) {
a3621e74 3118 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3119 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3120 s += UTF8SKIP(s);
3121 while (s < strend && *s == ch)
3122 s += UTF8SKIP(s);
3123 }
4cadc6a9 3124 );
7e0d5ad7 3125
6eb5f6b9
JH
3126 }
3127 else {
7e0d5ad7
KW
3128 if (! prog->anchored_substr) {
3129 if (! to_byte_substr(prog)) {
6b54ddc5 3130 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3131 }
3132 }
3133 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 3134 REXEC_FBC_SCAN(
6eb5f6b9 3135 if (*s == ch) {
a3621e74 3136 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3137 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3138 s++;
3139 while (s < strend && *s == ch)
3140 s++;
3141 }
4cadc6a9 3142 );
6eb5f6b9 3143 }
a3621e74 3144 DEBUG_EXECUTE_r(if (!did_match)
bf93d4cc 3145 PerlIO_printf(Perl_debug_log,
b7953727
JH
3146 "Did not find anchored character...\n")
3147 );
6eb5f6b9 3148 }
a0714e2c
SS
3149 else if (prog->anchored_substr != NULL
3150 || prog->anchored_utf8 != NULL
3151 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
3152 && prog->float_max_offset < strend - s)) {
3153 SV *must;
ea3daa5d
FC
3154 SSize_t back_max;
3155 SSize_t back_min;
33b8afdf 3156 char *last;
6eb5f6b9 3157 char *last1; /* Last position checked before */
bf93d4cc
GS
3158#ifdef DEBUGGING
3159 int did_match = 0;
3160#endif
33b8afdf 3161 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
3162 if (utf8_target) {
3163 if (! prog->anchored_utf8) {
3164 to_utf8_substr(prog);
3165 }
3166 must = prog->anchored_utf8;
3167 }
3168 else {
3169 if (! prog->anchored_substr) {
3170 if (! to_byte_substr(prog)) {
6b54ddc5 3171 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3172 }
3173 }
3174 must = prog->anchored_substr;
3175 }
33b8afdf
JH
3176 back_max = back_min = prog->anchored_offset;
3177 } else {
7e0d5ad7
KW
3178 if (utf8_target) {
3179 if (! prog->float_utf8) {
3180 to_utf8_substr(prog);
3181 }
3182 must = prog->float_utf8;
3183 }
3184 else {
3185 if (! prog->float_substr) {
3186 if (! to_byte_substr(prog)) {
6b54ddc5 3187 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3188 }
3189 }
3190 must = prog->float_substr;
3191 }
33b8afdf
JH
3192 back_max = prog->float_max_offset;
3193 back_min = prog->float_min_offset;
3194 }
1de06328 3195
1de06328
YO
3196 if (back_min<0) {
3197 last = strend;
3198 } else {
3199 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 3200 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
3201 - (SvTAIL(must) != 0) + back_min), strbeg);
3202 }
9d9163fb 3203 if (s > reginfo->strbeg)
6eb5f6b9
JH
3204 last1 = HOPc(s, -1);
3205 else
3206 last1 = s - 1; /* bogus */
3207
a0288114 3208 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 3209 check_substr==must. */
bf05793b 3210 dontbother = 0;
6eb5f6b9
JH
3211 strend = HOPc(strend, -dontbother);
3212 while ( (s <= last) &&
e50d57d4 3213 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 3214 (unsigned char*)strend, must,
c33e64f0 3215 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 3216 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
3217 if (HOPc(s, -back_max) > last1) {
3218 last1 = HOPc(s, -back_min);
3219 s = HOPc(s, -back_max);
3220 }
3221 else {
9d9163fb
DM
3222 char * const t = (last1 >= reginfo->strbeg)
3223 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
3224
3225 last1 = HOPc(s, -back_min);
52657f30 3226 s = t;
6eb5f6b9 3227 }
f2ed9b32 3228 if (utf8_target) {
6eb5f6b9 3229 while (s <= last1) {
02d5137b 3230 if (regtry(reginfo, &s))
6eb5f6b9 3231 goto got_it;
7016d6eb
DM
3232 if (s >= last1) {
3233 s++; /* to break out of outer loop */
3234 break;
3235 }
3236 s += UTF8SKIP(s);
6eb5f6b9
JH
3237 }
3238 }
3239 else {
3240 while (s <= last1) {
02d5137b 3241 if (regtry(reginfo, &s))
6eb5f6b9
JH
3242 goto got_it;
3243 s++;
3244 }
3245 }
3246 }
ab3bbdeb 3247 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 3248 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb
YO
3249 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3250 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
33b8afdf 3251 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 3252 ? "anchored" : "floating"),
ab3bbdeb
YO
3253 quoted, RE_SV_TAIL(must));
3254 });
6eb5f6b9
JH
3255 goto phooey;
3256 }
f8fc2ecf 3257 else if ( (c = progi->regstclass) ) {
f14c76ed 3258 if (minlen) {
f8fc2ecf 3259 const OPCODE op = OP(progi->regstclass);
66e933ab 3260 /* don't bother with what can't match */
33c28ab2 3261 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
f14c76ed
RGS
3262 strend = HOPc(strend, -(minlen - 1));
3263 }
a3621e74 3264 DEBUG_EXECUTE_r({
be8e71aa 3265 SV * const prop = sv_newmortal();
8b9781c9 3266 regprop(prog, prop, c, reginfo, NULL);
0df25f3d 3267 {
f2ed9b32 3268 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 3269 s,strend-s,60);
0df25f3d 3270 PerlIO_printf(Perl_debug_log,
1c8f8eb1 3271 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 3272 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 3273 quoted, (int)(strend - s));
0df25f3d 3274 }
ffc61ed2 3275 });
f9176b44 3276 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 3277 goto got_it;
07be1b83 3278 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
3279 }
3280 else {
3281 dontbother = 0;
a0714e2c 3282 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 3283 /* Trim the end. */
6af40bd7 3284 char *last= NULL;
33b8afdf 3285 SV* float_real;
c33e64f0
FC
3286 STRLEN len;
3287 const char *little;
33b8afdf 3288
7e0d5ad7
KW
3289 if (utf8_target) {
3290 if (! prog->float_utf8) {
3291 to_utf8_substr(prog);
3292 }
3293 float_real = prog->float_utf8;
3294 }
3295 else {
3296 if (! prog->float_substr) {
3297 if (! to_byte_substr(prog)) {
6b54ddc5 3298 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3299 }
3300 }
3301 float_real = prog->float_substr;
3302 }
d6a28714 3303
c33e64f0
FC
3304 little = SvPV_const(float_real, len);
3305 if (SvTAIL(float_real)) {
7f18ad16
KW
3306 /* This means that float_real contains an artificial \n on
3307 * the end due to the presence of something like this:
3308 * /foo$/ where we can match both "foo" and "foo\n" at the
3309 * end of the string. So we have to compare the end of the
3310 * string first against the float_real without the \n and
3311 * then against the full float_real with the string. We
3312 * have to watch out for cases where the string might be
3313 * smaller than the float_real or the float_real without
3314 * the \n. */
1a13b075
YO
3315 char *checkpos= strend - len;
3316 DEBUG_OPTIMISE_r(
3317 PerlIO_printf(Perl_debug_log,
3318 "%sChecking for float_real.%s\n",
3319 PL_colors[4], PL_colors[5]));
3320 if (checkpos + 1 < strbeg) {
7f18ad16
KW
3321 /* can't match, even if we remove the trailing \n
3322 * string is too short to match */
1a13b075
YO
3323 DEBUG_EXECUTE_r(
3324 PerlIO_printf(Perl_debug_log,
3325 "%sString shorter than required trailing substring, cannot match.%s\n",
3326 PL_colors[4], PL_colors[5]));
3327 goto phooey;
3328 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
3329 /* can match, the end of the string matches without the
3330 * "\n" */
1a13b075
YO
3331 last = checkpos + 1;
3332 } else if (checkpos < strbeg) {
7f18ad16
KW
3333 /* cant match, string is too short when the "\n" is
3334 * included */
1a13b075
YO
3335 DEBUG_EXECUTE_r(
3336 PerlIO_printf(Perl_debug_log,
3337 "%sString does not contain required trailing substring, cannot match.%s\n",
3338 PL_colors[4], PL_colors[5]));
3339 goto phooey;
3340 } else if (!multiline) {
7f18ad16
KW
3341 /* non multiline match, so compare with the "\n" at the
3342 * end of the string */
1a13b075
YO
3343 if (memEQ(checkpos, little, len)) {
3344 last= checkpos;
3345 } else {
3346 DEBUG_EXECUTE_r(
3347 PerlIO_printf(Perl_debug_log,
3348 "%sString does not contain required trailing substring, cannot match.%s\n",
3349 PL_colors[4], PL_colors[5]));
3350 goto phooey;
3351 }
3352 } else {
7f18ad16
KW
3353 /* multiline match, so we have to search for a place
3354 * where the full string is located */
d6a28714 3355 goto find_last;
1a13b075 3356 }
c33e64f0 3357 } else {
d6a28714 3358 find_last:
9041c2e3 3359 if (len)
d6a28714 3360 last = rninstr(s, strend, little, little + len);
b8c5462f 3361 else
a0288114 3362 last = strend; /* matching "$" */
b8c5462f 3363 }
6af40bd7 3364 if (!last) {
7f18ad16
KW
3365 /* at one point this block contained a comment which was
3366 * probably incorrect, which said that this was a "should not
3367 * happen" case. Even if it was true when it was written I am
3368 * pretty sure it is not anymore, so I have removed the comment
3369 * and replaced it with this one. Yves */
6bda09f9
YO
3370 DEBUG_EXECUTE_r(
3371 PerlIO_printf(Perl_debug_log,
b729e729
YO
3372 "%sString does not contain required substring, cannot match.%s\n",
3373 PL_colors[4], PL_colors[5]
6af40bd7
YO
3374 ));
3375 goto phooey;
bf93d4cc 3376 }
d6a28714
JH
3377 dontbother = strend - last + prog->float_min_offset;
3378 }
3379 if (minlen && (dontbother < minlen))
3380 dontbother = minlen - 1;
3381 strend -= dontbother; /* this one's always in bytes! */
3382 /* We don't know much -- general case. */
f2ed9b32 3383 if (utf8_target) {
d6a28714 3384 for (;;) {
02d5137b 3385 if (regtry(reginfo, &s))
d6a28714
JH
3386 goto got_it;
3387 if (s >= strend)
3388 break;
b8c5462f 3389 s += UTF8SKIP(s);
d6a28714
JH
3390 };
3391 }
3392 else {
3393 do {
02d5137b 3394 if (regtry(reginfo, &s))
d6a28714
JH
3395 goto got_it;
3396 } while (s++ < strend);
3397 }
3398 }
3399
3400 /* Failure. */
3401 goto phooey;
3402
7b52d656 3403 got_it:
d5e7783a
DM
3404 /* s/// doesn't like it if $& is earlier than where we asked it to
3405 * start searching (which can happen on something like /.\G/) */
3406 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3407 && (prog->offs[0].start < stringarg - strbeg))
3408 {
3409 /* this should only be possible under \G */
58430ea8 3410 assert(prog->intflags & PREGf_GPOS_SEEN);
d5e7783a
DM
3411 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3412 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3413 goto phooey;
3414 }
3415
495f47a5
DM
3416 DEBUG_BUFFERS_r(
3417 if (swap)
3418 PerlIO_printf(Perl_debug_log,
3419 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3420 PTR2UV(prog),
3421 PTR2UV(swap)
3422 );
3423 );
e9105d30 3424 Safefree(swap);
d6a28714 3425
bf2039a9
DM
3426 /* clean up; this will trigger destructors that will free all slabs
3427 * above the current one, and cleanup the regmatch_info_aux
3428 * and regmatch_info_aux_eval sructs */
8adc0f72 3429
006f26b2
DM
3430 LEAVE_SCOPE(oldsave);
3431
5daac39c
NC
3432 if (RXp_PAREN_NAMES(prog))
3433 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
3434
3435 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3436 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3437 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3438 strbeg, reginfo->strend,
3439 sv, flags, utf8_target);
9041c2e3 3440
d6a28714
JH
3441 return 1;
3442
7b52d656 3443 phooey:
a3621e74 3444 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
e4584336 3445 PL_colors[4], PL_colors[5]));
8adc0f72 3446
bf2039a9
DM
3447 /* clean up; this will trigger destructors that will free all slabs
3448 * above the current one, and cleanup the regmatch_info_aux
3449 * and regmatch_info_aux_eval sructs */
8adc0f72 3450
006f26b2
DM
3451 LEAVE_SCOPE(oldsave);
3452
e9105d30 3453 if (swap) {
c74340f9 3454 /* we failed :-( roll it back */
495f47a5
DM
3455 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3456 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3457 PTR2UV(prog),
3458 PTR2UV(prog->offs),
3459 PTR2UV(swap)
3460 ));
e9105d30
GG
3461 Safefree(prog->offs);
3462 prog->offs = swap;
3463 }
d6a28714
JH
3464 return 0;
3465}
3466
6bda09f9 3467
b3d298be 3468/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3469 * Do inc before dec, in case old and new rex are the same */
baa60164 3470#define SET_reg_curpm(Re2) \
bf2039a9 3471 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3472 (void)ReREFCNT_inc(Re2); \
3473 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3474 PM_SETRE((PL_reg_curpm), (Re2)); \
3475 }
3476
3477
d6a28714
JH
3478/*
3479 - regtry - try match at specific point
3480 */
3481STATIC I32 /* 0 failure, 1 success */
f73aaa43 3482S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3483{
d6a28714 3484 CHECKPOINT lastcp;
288b8c02 3485 REGEXP *const rx = reginfo->prog;
8d919b0a 3486 regexp *const prog = ReANY(rx);
99a90e59 3487 SSize_t result;
f8fc2ecf 3488 RXi_GET_DECL(prog,progi);
a3621e74 3489 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3490
3491 PERL_ARGS_ASSERT_REGTRY;
3492
24b23f37 3493 reginfo->cutpoint=NULL;
d6a28714 3494
9d9163fb 3495 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3496 prog->lastparen = 0;
03994de8 3497 prog->lastcloseparen = 0;
d6a28714
JH
3498
3499 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3500 to do this again and again, prog->lastparen should take care of
3dd2943c 3501 this! --ilya*/
dafc8851
JH
3502
3503 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3504 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3505 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3506 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3507 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3508 * Meanwhile, this code *is* needed for the
daf18116
JH
3509 * above-mentioned test suite tests to succeed. The common theme
3510 * on those tests seems to be returning null fields from matches.
225593e1 3511 * --jhi updated by dapm */
dafc8851 3512#if 1
d6a28714 3513 if (prog->nparens) {
b93070ed 3514 regexp_paren_pair *pp = prog->offs;
eb578fdb 3515 I32 i;
b93070ed 3516 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3517 ++pp;
3518 pp->start = -1;
3519 pp->end = -1;
d6a28714
JH
3520 }
3521 }
dafc8851 3522#endif
02db2b7b 3523 REGCP_SET(lastcp);
f73aaa43
DM
3524 result = regmatch(reginfo, *startposp, progi->program + 1);
3525 if (result != -1) {
3526 prog->offs[0].end = result;
d6a28714
JH
3527 return 1;
3528 }
24b23f37 3529 if (reginfo->cutpoint)
f73aaa43 3530 *startposp= reginfo->cutpoint;
02db2b7b 3531 REGCP_UNWIND(lastcp);
d6a28714
JH
3532 return 0;
3533}
3534
02db2b7b 3535
8ba1375e
MJD
3536#define sayYES goto yes
3537#define sayNO goto no
262b90c4 3538#define sayNO_SILENT goto no_silent
8ba1375e 3539
f9f4320a
YO
3540/* we dont use STMT_START/END here because it leads to
3541 "unreachable code" warnings, which are bogus, but distracting. */
3542#define CACHEsayNO \
c476f425 3543 if (ST.cache_mask) \
2ac8ff4b 3544 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3545 sayNO
3298f257 3546
a3621e74 3547/* this is used to determine how far from the left messages like
265c4333
YO
3548 'failed...' are printed. It should be set such that messages
3549 are inline with the regop output that created them.
a3621e74 3550*/
265c4333 3551#define REPORT_CODE_OFF 32
a3621e74
YO
3552
3553
40a82448
DM
3554#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3555#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3556#define CHRTEST_NOT_A_CP_1 -999
3557#define CHRTEST_NOT_A_CP_2 -998
9e137952 3558
5d9a96ca
DM
3559/* grab a new slab and return the first slot in it */
3560
3561STATIC regmatch_state *
3562S_push_slab(pTHX)
3563{
a35a87e7 3564#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
3565 dMY_CXT;
3566#endif
5d9a96ca
DM
3567 regmatch_slab *s = PL_regmatch_slab->next;
3568 if (!s) {
3569 Newx(s, 1, regmatch_slab);
3570 s->prev = PL_regmatch_slab;
3571 s->next = NULL;
3572 PL_regmatch_slab->next = s;
3573 }
3574 PL_regmatch_slab = s;
86545054 3575 return SLAB_FIRST(s);
5d9a96ca 3576}
5b47454d 3577
95b24440 3578
40a82448
DM
3579/* push a new state then goto it */
3580
4d5016e5
DM
3581#define PUSH_STATE_GOTO(state, node, input) \
3582 pushinput = input; \
40a82448
DM
3583 scan = node; \
3584 st->resume_state = state; \
3585 goto push_state;
3586
3587/* push a new state with success backtracking, then goto it */
3588
4d5016e5
DM
3589#define PUSH_YES_STATE_GOTO(state, node, input) \
3590 pushinput = input; \
40a82448
DM
3591 scan = node; \
3592 st->resume_state = state; \
3593 goto push_yes_state;
3594
aa283a38 3595
aa283a38 3596
4d5016e5 3597
d6a28714 3598/*
95b24440 3599
bf1f174e
DM
3600regmatch() - main matching routine
3601
3602This is basically one big switch statement in a loop. We execute an op,
3603set 'next' to point the next op, and continue. If we come to a point which
3604we may need to backtrack to on failure such as (A|B|C), we push a
3605backtrack state onto the backtrack stack. On failure, we pop the top
3606state, and re-enter the loop at the state indicated. If there are no more
3607states to pop, we return failure.
3608
3609Sometimes we also need to backtrack on success; for example /A+/, where
3610after successfully matching one A, we need to go back and try to
3611match another one; similarly for lookahead assertions: if the assertion
3612completes successfully, we backtrack to the state just before the assertion
3613and then carry on. In these cases, the pushed state is marked as
3614'backtrack on success too'. This marking is in fact done by a chain of
3615pointers, each pointing to the previous 'yes' state. On success, we pop to
3616the nearest yes state, discarding any intermediate failure-only states.
3617Sometimes a yes state is pushed just to force some cleanup code to be
3618called at the end of a successful match or submatch; e.g. (??{$re}) uses
3619it to free the inner regex.
3620
3621Note that failure backtracking rewinds the cursor position, while
3622success backtracking leaves it alone.
3623
3624A pattern is complete when the END op is executed, while a subpattern
3625such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3626ops trigger the "pop to last yes state if any, otherwise return true"
3627behaviour.
3628
3629A common convention in this function is to use A and B to refer to the two
3630subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3631the subpattern to be matched possibly multiple times, while B is the entire
3632rest of the pattern. Variable and state names reflect this convention.
3633
3634The states in the main switch are the union of ops and failure/success of
3635substates associated with with that op. For example, IFMATCH is the op
3636that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3637'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3638successfully matched A and IFMATCH_A_fail is a state saying that we have
3639just failed to match A. Resume states always come in pairs. The backtrack
3640state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3641at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3642on success or failure.
3643
3644The struct that holds a backtracking state is actually a big union, with
3645one variant for each major type of op. The variable st points to the
3646top-most backtrack struct. To make the code clearer, within each
3647block of code we #define ST to alias the relevant union.
3648
3649Here's a concrete example of a (vastly oversimplified) IFMATCH
3650implementation:
3651
3652 switch (state) {
3653 ....
3654
3655#define ST st->u.ifmatch
3656
3657 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3658 ST.foo = ...; // some state we wish to save
95b24440 3659 ...
bf1f174e
DM
3660 // push a yes backtrack state with a resume value of
3661 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3662 // first node of A:
4d5016e5 3663 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3664 // NOTREACHED
3665
3666 case IFMATCH_A: // we have successfully executed A; now continue with B
3667 next = B;
3668 bar = ST.foo; // do something with the preserved value
3669 break;
3670
3671 case IFMATCH_A_fail: // A failed, so the assertion failed
3672 ...; // do some housekeeping, then ...
3673 sayNO; // propagate the failure
3674
3675#undef ST
95b24440 3676
bf1f174e
DM
3677 ...
3678 }
95b24440 3679
bf1f174e
DM
3680For any old-timers reading this who are familiar with the old recursive
3681approach, the code above is equivalent to:
95b24440 3682
bf1f174e
DM
3683 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3684 {
3685 int foo = ...
95b24440 3686 ...
bf1f174e
DM
3687 if (regmatch(A)) {
3688 next = B;
3689 bar = foo;
3690 break;
95b24440 3691 }
bf1f174e
DM
3692 ...; // do some housekeeping, then ...
3693 sayNO; // propagate the failure
95b24440 3694 }
bf1f174e
DM
3695
3696The topmost backtrack state, pointed to by st, is usually free. If you
3697want to claim it, populate any ST.foo fields in it with values you wish to
3698save, then do one of
3699
4d5016e5
DM
3700 PUSH_STATE_GOTO(resume_state, node, newinput);
3701 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3702
3703which sets that backtrack state's resume value to 'resume_state', pushes a
3704new free entry to the top of the backtrack stack, then goes to 'node'.
3705On backtracking, the free slot is popped, and the saved state becomes the
3706new free state. An ST.foo field in this new top state can be temporarily
3707accessed to retrieve values, but once the main loop is re-entered, it
3708becomes available for reuse.
3709
3710Note that the depth of the backtrack stack constantly increases during the
3711left-to-right execution of the pattern, rather than going up and down with
3712the pattern nesting. For example the stack is at its maximum at Z at the
3713end of the pattern, rather than at X in the following:
3714
3715 /(((X)+)+)+....(Y)+....Z/
3716
3717The only exceptions to this are lookahead/behind assertions and the cut,
3718(?>A), which pop all the backtrack states associated with A before
3719continuing.
3720
486ec47a 3721Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3722PL_regmatch_state and st always point to the currently active state,
3723and PL_regmatch_slab points to the slab currently containing
3724PL_regmatch_state. The first time regmatch() is called, the first slab is
3725allocated, and is never freed until interpreter destruction. When the slab
3726is full, a new one is allocated and chained to the end. At exit from
3727regmatch(), slabs allocated since entry are freed.
3728
3729*/
95b24440 3730
40a82448 3731
5bc10b2c 3732#define DEBUG_STATE_pp(pp) \
265c4333 3733 DEBUG_STATE_r({ \
baa60164 3734 DUMP_EXEC_POS(locinput, scan, utf8_target); \
5bc10b2c 3735 PerlIO_printf(Perl_debug_log, \
5d458dd8 3736 " %*s"pp" %s%s%s%s%s\n", \
5bc10b2c 3737 depth*2, "", \
baa60164 3738 PL_reg_name[st->resume_state], \
5d458dd8
YO
3739 ((st==yes_state||st==mark_state) ? "[" : ""), \
3740 ((st==yes_state) ? "Y" : ""), \
3741 ((st==mark_state) ? "M" : ""), \
3742 ((st==yes_state||st==mark_state) ? "]" : "") \
3743 ); \
265c4333 3744 });
5bc10b2c 3745
40a82448 3746
3dab1dad 3747#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3748
3df15adc 3749#ifdef DEBUGGING
5bc10b2c 3750
ab3bbdeb 3751STATIC void
f2ed9b32 3752S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3753 const char *start, const char *end, const char *blurb)
3754{
efd26800 3755 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3756
3757 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3758
ab3bbdeb
YO
3759 if (!PL_colorset)
3760 reginitcolors();
3761 {
3762 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3763 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3764
f2ed9b32 3765 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3766 start, end - start, 60);
3767
3768 PerlIO_printf(Perl_debug_log,
3769 "%s%s REx%s %s against %s\n",
3770 PL_colors[4], blurb, PL_colors[5], s0, s1);
3771
f2ed9b32 3772 if (utf8_target||utf8_pat)
1de06328
YO
3773 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3774 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3775 utf8_pat && utf8_target ? " and " : "",
3776 utf8_target ? "string" : ""
ab3bbdeb
YO
3777 );
3778 }
3779}
3df15adc
YO
3780
3781STATIC void
786e8c11
YO
3782S_dump_exec_pos(pTHX_ const char *locinput,
3783 const regnode *scan,
3784 const char *loc_regeol,
3785 const char *loc_bostr,
3786 const char *loc_reg_starttry,
f2ed9b32 3787 const bool utf8_target)
07be1b83 3788{
786e8c11 3789 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3790 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3791 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3792 /* The part of the string before starttry has one color
3793 (pref0_len chars), between starttry and current
3794 position another one (pref_len - pref0_len chars),
3795 after the current position the third one.
3796 We assume that pref0_len <= pref_len, otherwise we
3797 decrease pref0_len. */
786e8c11
YO
3798 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3799 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3800 int pref0_len;
3801
7918f24d
NC
3802 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3803
f2ed9b32 3804 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3805 pref_len++;
786e8c11
YO
3806 pref0_len = pref_len - (locinput - loc_reg_starttry);
3807 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3808 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3809 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3810 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3811 l--;
3812 if (pref0_len < 0)
3813 pref0_len = 0;
3814 if (pref0_len > pref_len)
3815 pref0_len = pref_len;
3816 {
33c28ab2 3817 const int is_uni = utf8_target ? 1 : 0;
0df25f3d 3818
ab3bbdeb 3819 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3820 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3821
ab3bbdeb 3822 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3823 (locinput - pref_len + pref0_len),
1de06328 3824 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3825
ab3bbdeb 3826 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3827 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3828
1de06328 3829 const STRLEN tlen=len0+len1+len2;
3df15adc 3830 PerlIO_printf(Perl_debug_log,
ab3bbdeb 3831 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
786e8c11 3832 (IV)(locinput - loc_bostr),
07be1b83 3833 len0, s0,
07be1b83 3834 len1, s1,
07be1b83 3835 (docolor ? "" : "> <"),
07be1b83 3836 len2, s2,
f9f4320a 3837 (int)(tlen > 19 ? 0 : 19 - tlen),
07be1b83
YO
3838 "");
3839 }
3840}
3df15adc 3841
07be1b83
YO
3842#endif
3843
0a4db386
YO
3844/* reg_check_named_buff_matched()
3845 * Checks to see if a named buffer has matched. The data array of
3846 * buffer numbers corresponding to the buffer is expected to reside
3847 * in the regexp->data->data array in the slot stored in the ARG() of
3848 * node involved. Note that this routine doesn't actually care about the
3849 * name, that information is not preserved from compilation to execution.
3850 * Returns the index of the leftmost defined buffer with the given name
3851 * or 0 if non of the buffers matched.
3852 */
3853STATIC I32
dc3bf405 3854S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
7918f24d 3855{
0a4db386 3856 I32 n;
f8fc2ecf 3857 RXi_GET_DECL(rex,rexi);
ad64d0ec 3858 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3859 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3860
3861 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3862
0a4db386 3863 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
3864 if ((I32)rex->lastparen >= nums[n] &&
3865 rex->offs[nums[n]].end != -1)
0a4db386
YO
3866 {
3867 return nums[n];
3868 }
3869 }
3870 return 0;
3871}
3872
2f554ef7 3873
c74f6de9 3874static bool
984e6dd1 3875S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 3876 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 3877{
79a2a0e8
KW
3878 /* This function determines if there are one or two characters that match
3879 * the first character of the passed-in EXACTish node <text_node>, and if
3880 * so, returns them in the passed-in pointers.
c74f6de9 3881 *
79a2a0e8
KW
3882 * If it determines that no possible character in the target string can
3883 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3884 * the first character in <text_node> requires UTF-8 to represent, and the
3885 * target string isn't in UTF-8.)
c74f6de9 3886 *
79a2a0e8
KW
3887 * If there are more than two characters that could match the beginning of
3888 * <text_node>, or if more context is required to determine a match or not,
3889 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3890 *
3891 * The motiviation behind this function is to allow the caller to set up
3892 * tight loops for matching. If <text_node> is of type EXACT, there is
3893 * only one possible character that can match its first character, and so
3894 * the situation is quite simple. But things get much more complicated if
3895 * folding is involved. It may be that the first character of an EXACTFish
3896 * node doesn't participate in any possible fold, e.g., punctuation, so it
3897 * can be matched only by itself. The vast majority of characters that are
3898 * in folds match just two things, their lower and upper-case equivalents.
3899 * But not all are like that; some have multiple possible matches, or match
3900 * sequences of more than one character. This function sorts all that out.
3901 *
3902 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3903 * loop of trying to match A*, we know we can't exit where the thing
3904 * following it isn't a B. And something can't be a B unless it is the
3905 * beginning of B. By putting a quick test for that beginning in a tight
3906 * loop, we can rule out things that can't possibly be B without having to
3907 * break out of the loop, thus avoiding work. Similarly, if A is a single
3908 * character, we can make a tight loop matching A*, using the outputs of
3909 * this function.
3910 *
3911 * If the target string to match isn't in UTF-8, and there aren't
3912 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3913 * the one or two possible octets (which are characters in this situation)
3914 * that can match. In all cases, if there is only one character that can
3915 * match, *<c1p> and *<c2p> will be identical.
3916 *
3917 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3918 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3919 * can match the beginning of <text_node>. They should be declared with at
3920 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3921 * undefined what these contain.) If one or both of the buffers are
3922 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3923 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3924 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3925 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3926 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9 3927
ba44c216 3928 const bool utf8_target = reginfo->is_utf8_target;
79a2a0e8 3929
9cd990bf
JH
3930 UV c1 = (UV)CHRTEST_NOT_A_CP_1;
3931 UV c2 = (UV)CHRTEST_NOT_A_CP_2;
79a2a0e8 3932 bool use_chrtest_void = FALSE;
aed7b151 3933 const bool is_utf8_pat = reginfo->is_utf8_pat;
79a2a0e8
KW
3934
3935 /* Used when we have both utf8 input and utf8 output, to avoid converting
3936 * to/from code points */
3937 bool utf8_has_been_setup = FALSE;
3938
c74f6de9
KW
3939 dVAR;
3940
b4291290 3941 U8 *pat = (U8*)STRING(text_node);
a6715020 3942 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
c74f6de9 3943
a4525e78 3944 if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
79a2a0e8
KW
3945
3946 /* In an exact node, only one thing can be matched, that first
3947 * character. If both the pat and the target are UTF-8, we can just
3948 * copy the input to the output, avoiding finding the code point of
3949 * that character */
984e6dd1 3950 if (!is_utf8_pat) {
79a2a0e8
KW
3951 c2 = c1 = *pat;
3952 }
3953 else if (utf8_target) {
3954 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3955 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3956 utf8_has_been_setup = TRUE;
3957 }
3958 else {
3959 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 3960 }
79a2a0e8 3961 }
31f05a37
KW
3962 else { /* an EXACTFish node */
3963 U8 *pat_end = pat + STR_LEN(text_node);
3964
3965 /* An EXACTFL node has at least some characters unfolded, because what
3966 * they match is not known until now. So, now is the time to fold
3967 * the first few of them, as many as are needed to determine 'c1' and
3968 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
3969 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3970 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
3971 * need to fold as many characters as a single character can fold to,
3972 * so that later we can check if the first ones are such a multi-char
3973 * fold. But, in such a pattern only locale-problematic characters
3974 * aren't folded, so we can skip this completely if the first character
3975 * in the node isn't one of the tricky ones */
3976 if (OP(text_node) == EXACTFL) {
3977
3978 if (! is_utf8_pat) {
3979 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3980 {
3981 folded[0] = folded[1] = 's';
3982 pat = folded;
3983 pat_end = folded + 2;
3984 }
3985 }
3986 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3987 U8 *s = pat;
3988 U8 *d = folded;
3989 int i;
3990
3991 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3992 if (isASCII(*s)) {
3993 *(d++) = (U8) toFOLD_LC(*s);
3994 s++;
3995 }
3996 else {
3997 STRLEN len;
3998 _to_utf8_fold_flags(s,
3999 d,
4000 &len,
4001 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4002 d += len;
4003 s += UTF8SKIP(s);
4004 }
4005 }
4006
4007 pat = folded;
4008 pat_end = d;
4009 }
4010 }
4011
251b239f
KW
4012 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4013 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
baa60164
KW
4014 {
4015 /* Multi-character folds require more context to sort out. Also
4016 * PL_utf8_foldclosures used below doesn't handle them, so have to
4017 * be handled outside this routine */
4018 use_chrtest_void = TRUE;
4019 }
4020 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4021 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
e8ea8356 4022 if (c1 > 255) {
baa60164
KW
4023 /* Load the folds hash, if not already done */
4024 SV** listp;
4025 if (! PL_utf8_foldclosures) {
31667e6b 4026 _load_PL_utf8_foldclosures();
79a2a0e8 4027 }
79a2a0e8 4028
baa60164
KW
4029 /* The fold closures data structure is a hash with the keys
4030 * being the UTF-8 of every character that is folded to, like
4031 * 'k', and the values each an array of all code points that
4032 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
4033 * Multi-character folds are not included */
4034 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4035 (char *) pat,
4036 UTF8SKIP(pat),
4037 FALSE))))
4038 {
4039 /* Not found in the hash, therefore there are no folds
4040 * containing it, so there is only a single character that
4041 * could match */
4042 c2 = c1;
79a2a0e8 4043 }
baa60164
KW
4044 else { /* Does participate in folds */
4045 AV* list = (AV*) *listp;
b9f2b683 4046 if (av_tindex(list) != 1) {
79a2a0e8 4047
baa60164
KW
4048 /* If there aren't exactly two folds to this, it is
4049 * outside the scope of this function */
4050 use_chrtest_void = TRUE;
79a2a0e8 4051 }
baa60164
KW
4052 else { /* There are two. Get them */
4053 SV** c_p = av_fetch(list, 0, FALSE);
4054 if (c_p == NULL) {
4055 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4056 }
4057 c1 = SvUV(*c_p);
4058
4059 c_p = av_fetch(list, 1, FALSE);
4060 if (c_p == NULL) {
4061 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4062 }
4063 c2 = SvUV(*c_p);
4064
4065 /* Folds that cross the 255/256 boundary are forbidden
4066 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4067 * one is ASCIII. Since the pattern character is above
e8ea8356 4068 * 255, and its only other match is below 256, the only
baa60164
KW
4069 * legal match will be to itself. We have thrown away
4070 * the original, so have to compute which is the one
e8ea8356 4071 * above 255. */
baa60164
KW
4072 if ((c1 < 256) != (c2 < 256)) {
4073 if ((OP(text_node) == EXACTFL
4074 && ! IN_UTF8_CTYPE_LOCALE)
4075 || ((OP(text_node) == EXACTFA
4076 || OP(text_node) == EXACTFA_NO_TRIE)
4077 && (isASCII(c1) || isASCII(c2))))
4078 {
4079 if (c1 < 256) {
4080 c1 = c2;
4081 }
4082 else {
4083 c2 = c1;
4084 }
79a2a0e8
KW
4085 }
4086 }
4087 }
4088 }
4089 }
e8ea8356 4090 else /* Here, c1 is <= 255 */
baa60164
KW
4091 if (utf8_target
4092 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4093 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4094 && ((OP(text_node) != EXACTFA
4095 && OP(text_node) != EXACTFA_NO_TRIE)
4096 || ! isASCII(c1)))
4097 {
4098 /* Here, there could be something above Latin1 in the target
4099 * which folds to this character in the pattern. All such
4100 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4101 * than two characters involved in their folds, so are outside
4102 * the scope of this function */
4103 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4104 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4105 }
4106 else {
4107 use_chrtest_void = TRUE;
4108 }
79a2a0e8 4109 }
baa60164
KW
4110 else { /* Here nothing above Latin1 can fold to the pattern
4111 character */
4112 switch (OP(text_node)) {
c74f6de9 4113
baa60164
KW
4114 case EXACTFL: /* /l rules */
4115 c2 = PL_fold_locale[c1];
4116 break;
c74f6de9 4117
baa60164
KW
4118 case EXACTF: /* This node only generated for non-utf8
4119 patterns */
4120 assert(! is_utf8_pat);
4121 if (! utf8_target) { /* /d rules */
4122 c2 = PL_fold[c1];
4123 break;
4124 }
4125 /* FALLTHROUGH */
4126 /* /u rules for all these. This happens to work for
4127 * EXACTFA as nothing in Latin1 folds to ASCII */
4128 case EXACTFA_NO_TRIE: /* This node only generated for
4129 non-utf8 patterns */
4130 assert(! is_utf8_pat);
924ba076 4131 /* FALLTHROUGH */
baa60164
KW
4132 case EXACTFA:
4133 case EXACTFU_SS:
4134 case EXACTFU:
4135 c2 = PL_fold_latin1[c1];
c74f6de9 4136 break;
c74f6de9 4137
baa60164
KW
4138 default:
4139 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
e5964223 4140 NOT_REACHED; /* NOTREACHED */
baa60164 4141 }
c74f6de9
KW
4142 }
4143 }
4144 }
79a2a0e8
KW
4145
4146 /* Here have figured things out. Set up the returns */
4147 if (use_chrtest_void) {
4148 *c2p = *c1p = CHRTEST_VOID;
4149 }
4150 else if (utf8_target) {
4151 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
4152 uvchr_to_utf8(c1_utf8, c1);
4153 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 4154 }
c74f6de9 4155
79a2a0e8
KW
4156 /* Invariants are stored in both the utf8 and byte outputs; Use
4157 * negative numbers otherwise for the byte ones. Make sure that the
4158 * byte ones are the same iff the utf8 ones are the same */
4159 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4160 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4161 ? *c2_utf8
4162 : (c1 == c2)
4163 ? CHRTEST_NOT_A_CP_1
4164 : CHRTEST_NOT_A_CP_2;
4165 }
4166 else if (c1 > 255) {
4167 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
4168 can represent */
4169 return FALSE;
4170 }
c74f6de9 4171
79a2a0e8
KW
4172 *c1p = *c2p = c2; /* c2 is the only representable value */
4173 }
4174 else { /* c1 is representable; see about c2 */
4175 *c1p = c1;
4176 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 4177 }
2f554ef7 4178
c74f6de9
KW
4179 return TRUE;
4180}
2f554ef7 4181
64935bc6
KW
4182/* This creates a single number by combining two, with 'before' being like the
4183 * 10's digit, but this isn't necessarily base 10; it is base however many
4184 * elements of the enum there are */
85e5f08b 4185#define GCBcase(before, after) ((GCB_ENUM_COUNT * before) + after)
64935bc6
KW
4186
4187STATIC bool
85e5f08b 4188S_isGCB(const GCB_enum before, const GCB_enum after)
64935bc6
KW
4189{
4190 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4191 * between the inputs. See http://www.unicode.org/reports/tr29/ */
4192
4193 switch (GCBcase(before, after)) {
4194
4195 /* Break at the start and end of text.
4196 GB1. sot ÷
4197 GB2. ÷ eot
4198
4199 Break before and after controls except between CR and LF
4200 GB4. ( Control | CR | LF ) ÷
4201 GB5. ÷ ( Control | CR | LF )
4202
4203 Otherwise, break everywhere.
4204 GB10. Any ÷ Any */
4205 default:
4206 return TRUE;
4207
4208 /* Do not break between a CR and LF.
4209 GB3. CR × LF */
85e5f08b 4210 case GCBcase(GCB_CR, GCB_LF):
64935bc6
KW
4211 return FALSE;
4212
4213 /* Do not break Hangul syllable sequences.
4214 GB6. L × ( L | V | LV | LVT ) */
85e5f08b
KW
4215 case GCBcase(GCB_L, GCB_L):
4216 case GCBcase(GCB_L, GCB_V):
4217 case GCBcase(GCB_L, GCB_LV):
4218 case GCBcase(GCB_L, GCB_LVT):
64935bc6
KW
4219 return FALSE;
4220
4221 /* GB7. ( LV | V ) × ( V | T ) */
85e5f08b
KW
4222 case GCBcase(GCB_LV, GCB_V):
4223 case GCBcase(GCB_LV, GCB_T):
4224 case GCBcase(GCB_V, GCB_V):
4225 case GCBcase(GCB_V, GCB_T):
64935bc6
KW
4226 return FALSE;
4227
4228 /* GB8. ( LVT | T) × T */
85e5f08b
KW
4229 case GCBcase(GCB_LVT, GCB_T):
4230 case GCBcase(GCB_T, GCB_T):
64935bc6
KW
4231 return FALSE;
4232
4233 /* Do not break between regional indicator symbols.
4234 GB8a. Regional_Indicator × Regional_Indicator */
85e5f08b 4235 case GCBcase(GCB_Regional_Indicator, GCB_Regional_Indicator):
64935bc6
KW
4236 return FALSE;
4237
4238 /* Do not break before extending characters.
4239 GB9. × Extend */
85e5f08b
KW
4240 case GCBcase(GCB_Other, GCB_Extend):
4241 case GCBcase(GCB_Extend, GCB_Extend):
4242 case GCBcase(GCB_L, GCB_Extend):
4243 case GCBcase(GCB_LV, GCB_Extend):
4244 case GCBcase(GCB_LVT, GCB_Extend):
4245 case GCBcase(GCB_Prepend, GCB_Extend):
4246 case GCBcase(GCB_Regional_Indicator, GCB_Extend):
4247 case GCBcase(GCB_SpacingMark, GCB_Extend):
4248 case GCBcase(GCB_T, GCB_Extend):
4249 case GCBcase(GCB_V, GCB_Extend):
64935bc6
KW
4250 return FALSE;
4251
4252 /* Do not break before SpacingMarks, or after Prepend characters.
4253 GB9a. × SpacingMark */
85e5f08b
KW
4254 case GCBcase(GCB_Other, GCB_SpacingMark):
4255 case GCBcase(GCB_Extend, GCB_SpacingMark):
4256 case GCBcase(GCB_L, GCB_SpacingMark):
4257 case GCBcase(GCB_LV, GCB_SpacingMark):
4258 case GCBcase(GCB_LVT, GCB_SpacingMark):
4259 case GCBcase(GCB_Prepend, GCB_SpacingMark):
4260 case GCBcase(GCB_Regional_Indicator, GCB_SpacingMark):
4261 case GCBcase(GCB_SpacingMark, GCB_SpacingMark):
4262 case GCBcase(GCB_T, GCB_SpacingMark):
4263 case GCBcase(GCB_V, GCB_SpacingMark):
64935bc6
KW
4264 return FALSE;
4265
4266 /* GB9b. Prepend × */
85e5f08b
KW
4267 case GCBcase(GCB_Prepend, GCB_Other):
4268 case GCBcase(GCB_Prepend, GCB_L):
4269 case GCBcase(GCB_Prepend, GCB_LV):
4270 case GCBcase(GCB_Prepend, GCB_LVT):
4271 case GCBcase(GCB_Prepend, GCB_Prepend):
4272 case GCBcase(GCB_Prepend, GCB_Regional_Indicator):
4273 case GCBcase(GCB_Prepend, GCB_T):
4274 case GCBcase(GCB_Prepend, GCB_V):
64935bc6
KW
4275 return FALSE;
4276 }
4277
661d43c4 4278 NOT_REACHED; /* NOTREACHED */
64935bc6
KW
4279}
4280
06ae2722
KW
4281#define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
4282
4283STATIC bool
85e5f08b
KW
4284S_isSB(pTHX_ SB_enum before,
4285 SB_enum after,
06ae2722
KW
4286 const U8 * const strbeg,
4287 const U8 * const curpos,
4288 const U8 * const strend,
4289 const bool utf8_target)
4290{
4291 /* returns a boolean indicating if there is a Sentence Boundary Break
4292 * between the inputs. See http://www.unicode.org/reports/tr29/ */
4293
4294 U8 * lpos = (U8 *) curpos;
4295 U8 * temp_pos;
85e5f08b 4296 SB_enum backup;
06ae2722
KW
4297
4298 PERL_ARGS_ASSERT_ISSB;
4299
4300 /* Break at the start and end of text.
4301 SB1. sot ÷
4302 SB2. ÷ eot */
85e5f08b 4303 if (before == SB_EDGE || after == SB_EDGE) {
06ae2722
KW
4304 return TRUE;
4305 }
4306
4307 /* SB 3: Do not break within CRLF. */
85e5f08b 4308 if (before == SB_CR && after == SB_LF) {
06ae2722
KW
4309 return FALSE;
4310 }
4311
4312 /* Break after paragraph separators. (though why CR and LF are considered
4313 * so is beyond me (khw)
4314 SB4. Sep | CR | LF ÷ */
85e5f08b 4315 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
06ae2722
KW
4316 return TRUE;
4317 }
4318
4319 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4320 * (See Section 6.2, Replacing Ignore Rules.)
4321 SB5. X (Extend | Format)* → X */
85e5f08b 4322 if (after == SB_Extend || after == SB_Format) {
06ae2722
KW
4323 return FALSE;
4324 }
4325
85e5f08b 4326 if (before == SB_Extend || before == SB_Format) {
06ae2722
KW
4327 before = backup_one_SB(strbeg, &lpos, utf8_target);
4328 }
4329
4330 /* Do not break after ambiguous terminators like period, if they are
4331 * immediately followed by a number or lowercase letter, if they are
4332 * between uppercase letters, if the first following letter (optionally
4333 * after certain punctuation) is lowercase, or if they are followed by
4334 * "continuation" punctuation such as comma, colon, or semicolon. For
4335 * example, a period may be an abbreviation or numeric period, and thus may
4336 * not mark the end of a sentence.
4337
4338 * SB6. ATerm × Numeric */
85e5f08b 4339 if (before == SB_ATerm && after == SB_Numeric) {
06ae2722
KW
4340 return FALSE;
4341 }
4342
d4005659 4343 /* SB7. (Upper | Lower) ATerm × Upper */
85e5f08b 4344 if (before == SB_ATerm && after == SB_Upper) {
06ae2722 4345 temp_pos = lpos;
d4005659
KW
4346 backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4347 if (backup == SB_Upper || backup == SB_Lower) {
06ae2722
KW
4348 return FALSE;
4349 }
4350 }
4351
4352 /* SB8a. (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
4353 * SB10. (STerm | ATerm) Close* Sp* × ( Sp | Sep | CR | LF ) */
4354 backup = before;
4355 temp_pos = lpos;
85e5f08b 4356 while (backup == SB_Sp) {
06ae2722
KW
4357 backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4358 }
85e5f08b 4359 while (backup == SB_Close) {
06ae2722
KW
4360 backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4361 }
85e5f08b
KW
4362 if ((backup == SB_STerm || backup == SB_ATerm)
4363 && ( after == SB_SContinue
4364 || after == SB_STerm
4365 || after == SB_ATerm
4366 || after == SB_Sp
4367 || after == SB_Sep
4368 || after == SB_CR
4369 || after == SB_LF))
06ae2722
KW
4370 {
4371 return FALSE;
4372 }
4373
4374 /* SB8. ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR | LF |
4375 * STerm | ATerm) )* Lower */
85e5f08b 4376 if (backup == SB_ATerm) {
06ae2722 4377 U8 * rpos = (U8 *) curpos;
85e5f08b
KW
4378 SB_enum later = after;
4379
4380 while ( later != SB_OLetter
4381 && later != SB_Upper
4382 && later != SB_Lower
4383 && later != SB_Sep
4384 && later != SB_CR
4385 && later != SB_LF
4386 && later != SB_STerm
4387 && later != SB_ATerm
4388 && later != SB_EDGE)
06ae2722
KW
4389 {
4390 later = advance_one_SB(&rpos, strend, utf8_target);
4391 }
85e5f08b 4392 if (later == SB_Lower) {
06ae2722
KW
4393 return FALSE;
4394 }
4395 }
4396
4397 /* Break after sentence terminators, but include closing punctuation,
4398 * trailing spaces, and a paragraph separator (if present). [See note
4399 * below.]
4400 * SB9. ( STerm | ATerm ) Close* × ( Close | Sp | Sep | CR | LF ) */
4401 backup = before;
4402 temp_pos = lpos;
85e5f08b 4403 while (backup == SB_Close) {
06ae2722
KW
4404 backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4405 }
85e5f08b
KW
4406 if ((backup == SB_STerm || backup == SB_ATerm)
4407 && ( after == SB_Close
4408 || after == SB_Sp
4409 || after == SB_Sep
4410 || after == SB_CR
4411 || after == SB_LF))
06ae2722
KW
4412 {
4413 return FALSE;
4414 }
4415
4416
4417 /* SB11. ( STerm | ATerm ) Close* Sp* ( Sep | CR | LF )? ÷ */
4418 temp_pos = lpos;
4419 backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
85e5f08b
KW
4420 if ( backup == SB_Sep
4421 || backup == SB_CR
4422 || backup == SB_LF)
06ae2722
KW
4423 {
4424 lpos = temp_pos;
4425 }
4426 else {
4427 backup = before;
4428 }
85e5f08b 4429 while (backup == SB_Sp) {
06ae2722
KW
4430 backup = backup_one_SB(strbeg, &lpos, utf8_target);
4431 }
85e5f08b 4432 while (backup == SB_Close) {
06ae2722
KW
4433 backup = backup_one_SB(strbeg, &lpos, utf8_target);
4434 }
85e5f08b 4435 if (backup == SB_STerm || backup == SB_ATerm) {
06ae2722
KW
4436 return TRUE;
4437 }
4438
4439 /* Otherwise, do not break.
4440 SB12. Any × Any */
4441
4442 return FALSE;
4443}
4444
85e5f08b 4445STATIC SB_enum
06ae2722
KW
4446S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4447{
85e5f08b 4448 SB_enum sb;
06ae2722
KW
4449
4450 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4451
4452 if (*curpos >= strend) {
85e5f08b 4453 return SB_EDGE;
06ae2722
KW
4454 }
4455
4456 if (utf8_target) {
4457 do {
4458 *curpos += UTF8SKIP(*curpos);
4459 if (*curpos >= strend) {
85e5f08b 4460 return SB_EDGE;
06ae2722
KW
4461 }
4462 sb = getSB_VAL_UTF8(*curpos, strend);
85e5f08b 4463 } while (sb == SB_Extend || sb == SB_Format);
06ae2722
KW
4464 }
4465 else {
4466 do {
4467 (*curpos)++;
4468 if (*curpos >= strend) {
85e5f08b 4469 return SB_EDGE;
06ae2722
KW
4470 }
4471 sb = getSB_VAL_CP(**curpos);
85e5f08b 4472 } while (sb == SB_Extend || sb == SB_Format);
06ae2722
KW
4473 }
4474
4475 return sb;
4476}
4477
85e5f08b 4478STATIC SB_enum
06ae2722
KW
4479S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4480{
85e5f08b 4481 SB_enum sb;
06ae2722
KW
4482
4483 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4484
4485 if (*curpos < strbeg) {
85e5f08b 4486 return SB_EDGE;
06ae2722
KW
4487 }
4488
4489 if (utf8_target) {
4490 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4491 if (! prev_char_pos) {
85e5f08b 4492 return SB_EDGE;
06ae2722
KW
4493 }
4494
4495 /* Back up over Extend and Format. curpos is always just to the right
4496 * of the characater whose value we are getting */
4497 do {
4498 U8 * prev_prev_char_pos;
4499 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4500 strbeg)))
4501 {
4502 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4503 *curpos = prev_char_pos;
4504 prev_char_pos = prev_prev_char_pos;
4505 }
4506 else {
4507 *curpos = (U8 *) strbeg;
85e5f08b 4508 return SB_EDGE;
06ae2722 4509 }
85e5f08b 4510 } while (sb == SB_Extend || sb == SB_Format);
06ae2722
KW
4511 }
4512 else {
4513 do {
4514 if (*curpos - 2 < strbeg) {
4515 *curpos = (U8 *) strbeg;
85e5f08b 4516 return SB_EDGE;
06ae2722
KW
4517 }
4518 (*curpos)--;
4519 sb = getSB_VAL_CP(*(*curpos - 1));
85e5f08b 4520 } while (sb == SB_Extend || sb == SB_Format);
06ae2722
KW
4521 }
4522
4523 return sb;
4524}
4525
85e5f08b 4526#define WBcase(before, after) ((WB_ENUM_COUNT * before) + after)
ae3bb8ea
KW
4527
4528STATIC bool
85e5f08b
KW
4529S_isWB(pTHX_ WB_enum previous,
4530 WB_enum before,
4531 WB_enum after,
ae3bb8ea
KW
4532 const U8 * const strbeg,
4533 const U8 * const curpos,
4534 const U8 * const strend,
4535 const bool utf8_target)
4536{
4537 /* Return a boolean as to if the boundary between 'before' and 'after' is
4538 * a Unicode word break, using their published algorithm. Context may be
4539 * needed to make this determination. If the value for the character
4540 * before 'before' is known, it is passed as 'previous'; otherwise that
85e5f08b 4541 * should be set to WB_UNKNOWN. The other input parameters give the
ae3bb8ea
KW
4542 * boundaries and current position in the matching of the string. That
4543 * is, 'curpos' marks the position where the character whose wb value is
4544 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
4545
4546 U8 * before_pos = (U8 *) curpos;
4547 U8 * after_pos = (U8 *) curpos;
4548
4549 PERL_ARGS_ASSERT_ISWB;
4550
4551 /* WB1 and WB2: Break at the start and end of text. */
85e5f08b 4552 if (before == WB_EDGE || after == WB_EDGE) {
ae3bb8ea
KW
4553 return TRUE;
4554 }
4555
4556 /* WB 3: Do not break within CRLF. */
85e5f08b 4557 if (before == WB_CR && after == WB_LF) {
ae3bb8ea
KW
4558 return FALSE;
4559 }
4560
4561 /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
4562 * and LF) */
85e5f08b
KW
4563 if ( before == WB_CR || before == WB_LF || before == WB_Newline
4564 || after == WB_CR || after == WB_LF || after == WB_Newline)
ae3bb8ea
KW
4565 {
4566 return TRUE;
4567 }
4568
4569 /* Ignore Format and Extend characters, except when they appear at the
4570 * beginning of a region of text.
4571 * WB4. X (Extend | Format)* → X. */
4572
85e5f08b 4573 if (after == WB_Extend || after == WB_Format) {
ae3bb8ea
KW
4574 return FALSE;
4575 }
4576
85e5f08b 4577 if (before == WB_Extend || before == WB_Format) {
ae3bb8ea
KW
4578 before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4579 }
4580
4581 switch (WBcase(before, after)) {
4582 /* Otherwise, break everywhere (including around ideographs).
4583 WB14. Any ÷ Any */
4584 default:
4585 return TRUE;
4586
4587 /* Do not break between most letters.
4588 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
85e5f08b
KW
4589 case WBcase(WB_ALetter, WB_ALetter):
4590 case WBcase(WB_ALetter, WB_Hebrew_Letter):
4591 case WBcase(WB_Hebrew_Letter, WB_ALetter):
4592 case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter):
ae3bb8ea
KW
4593 return FALSE;
4594
4595 /* Do not break letters across certain punctuation.
4596 WB6. (ALetter | Hebrew_Letter)
4597 × (MidLetter | MidNumLet | Single_Quote) (ALetter
4598 | Hebrew_Letter) */
85e5f08b
KW
4599 case WBcase(WB_ALetter, WB_MidLetter):
4600 case WBcase(WB_ALetter, WB_MidNumLet):
4601 case WBcase(WB_ALetter, WB_Single_Quote):
4602 case WBcase(WB_Hebrew_Letter, WB_MidLetter):
4603 case WBcase(WB_Hebrew_Letter, WB_MidNumLet):
4604 /*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/
ae3bb8ea 4605 after = advance_one_WB(&after_pos, strend, utf8_target);
85e5f08b 4606 return after != WB_ALetter && after != WB_Hebrew_Letter;
ae3bb8ea
KW
4607
4608 /* WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
4609 * Single_Quote) × (ALetter | Hebrew_Letter) */
85e5f08b
KW
4610 case WBcase(WB_MidLetter, WB_ALetter):
4611 case WBcase(WB_MidLetter, WB_Hebrew_Letter):
4612 case WBcase(WB_MidNumLet, WB_ALetter):
4613 case WBcase(WB_MidNumLet, WB_Hebrew_Letter):
4614 case WBcase(WB_Single_Quote, WB_ALetter):
4615 case WBcase(WB_Single_Quote, WB_Hebrew_Letter):
ae3bb8ea
KW
4616 before
4617 = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
85e5f08b 4618 return before != WB_ALetter && before != WB_Hebrew_Letter;
ae3bb8ea
KW
4619
4620 /* WB7a. Hebrew_Letter × Single_Quote */
85e5f08b 4621 case WBcase(WB_Hebrew_Letter, WB_Single_Quote):
ae3bb8ea
KW
4622 return FALSE;
4623
4624 /* WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter */
85e5f08b 4625 case WBcase(WB_Hebrew_Letter, WB_Double_Quote):
ae3bb8ea 4626 return advance_one_WB(&after_pos, strend, utf8_target)
85e5f08b 4627 != WB_Hebrew_Letter;
ae3bb8ea
KW
4628
4629 /* WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter */
85e5f08b 4630 case WBcase(WB_Double_Quote, WB_Hebrew_Letter):
ae3bb8ea 4631 return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
85e5f08b 4632 != WB_Hebrew_Letter;
ae3bb8ea
KW
4633
4634 /* Do not break within sequences of digits, or digits adjacent to
4635 * letters (“3a”, or “A3”).
4636 WB8. Numeric × Numeric */
85e5f08b 4637 case WBcase(WB_Numeric, WB_Numeric):
ae3bb8ea
KW
4638 return FALSE;
4639
4640 /* WB9. (ALetter | Hebrew_Letter) × Numeric */
85e5f08b
KW
4641 case WBcase(WB_ALetter, WB_Numeric):
4642 case WBcase(WB_Hebrew_Letter, WB_Numeric):
ae3bb8ea
KW
4643 return FALSE;
4644
4645 /* WB10. Numeric × (ALetter | Hebrew_Letter) */
85e5f08b
KW
4646 case WBcase(WB_Numeric, WB_ALetter):
4647 case WBcase(WB_Numeric, WB_Hebrew_Letter):
ae3bb8ea
KW
4648 return FALSE;
4649
4650 /* Do not break within sequences, such as “3.2” or “3,456.789”.
4651 WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
4652 */
85e5f08b
KW
4653 case WBcase(WB_MidNum, WB_Numeric):
4654 case WBcase(WB_MidNumLet, WB_Numeric):
4655 case WBcase(WB_Single_Quote, WB_Numeric):
ae3bb8ea 4656 return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
85e5f08b 4657 != WB_Numeric;
ae3bb8ea
KW
4658
4659 /* WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
4660 * */
85e5f08b
KW
4661 case WBcase(WB_Numeric, WB_MidNum):
4662 case WBcase(WB_Numeric, WB_MidNumLet):
4663 case WBcase(WB_Numeric, WB_Single_Quote):
ae3bb8ea 4664 return advance_one_WB(&after_pos, strend, utf8_target)
85e5f08b 4665 != WB_Numeric;
ae3bb8ea
KW
4666
4667 /* Do not break between Katakana.
4668 WB13. Katakana × Katakana */
85e5f08b 4669 case WBcase(WB_Katakana, WB_Katakana):
ae3bb8ea
KW
4670 return FALSE;
4671
4672 /* Do not break from extenders.
4673 WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana |
4674 ExtendNumLet) × ExtendNumLet */
85e5f08b
KW
4675 case WBcase(WB_ALetter, WB_ExtendNumLet):
4676 case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet):
4677 case WBcase(WB_Numeric, WB_ExtendNumLet):
4678 case WBcase(WB_Katakana, WB_ExtendNumLet):
4679 case WBcase(WB_ExtendNumLet, WB_ExtendNumLet):
ae3bb8ea
KW
4680 return FALSE;
4681
4682 /* WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric
4683 * | Katakana) */
85e5f08b
KW
4684 case WBcase(WB_ExtendNumLet, WB_ALetter):
4685 case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter):
4686 case WBcase(WB_ExtendNumLet, WB_Numeric):
4687 case WBcase(WB_ExtendNumLet, WB_Katakana):
ae3bb8ea
KW
4688 return FALSE;
4689
4690 /* Do not break between regional indicator symbols.
4691 WB13c. Regional_Indicator × Regional_Indicator */
85e5f08b 4692 case WBcase(WB_Regional_Indicator, WB_Regional_Indicator):
ae3bb8ea
KW
4693 return FALSE;
4694
4695 }
4696
661d43c4 4697 NOT_REACHED; /* NOTREACHED */
ae3bb8ea
KW
4698}
4699
85e5f08b 4700STATIC WB_enum
ae3bb8ea
KW
4701S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4702{
85e5f08b 4703 WB_enum wb;
ae3bb8ea
KW
4704
4705 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
4706
4707 if (*curpos >= strend) {
85e5f08b 4708 return WB_EDGE;
ae3bb8ea
KW
4709 }
4710
4711 if (utf8_target) {
4712
4713 /* Advance over Extend and Format */
4714 do {
4715 *curpos += UTF8SKIP(*curpos);
4716 if (*curpos >= strend) {
85e5f08b 4717 return WB_EDGE;
ae3bb8ea
KW
4718 }
4719 wb = getWB_VAL_UTF8(*curpos, strend);
85e5f08b 4720 } while (wb == WB_Extend || wb == WB_Format);
ae3bb8ea
KW
4721 }
4722 else {
4723 do {
4724 (*curpos)++;
4725 if (*curpos >= strend) {
85e5f08b 4726 return WB_EDGE;
ae3bb8ea
KW
4727 }
4728 wb = getWB_VAL_CP(**curpos);
85e5f08b 4729 } while (wb == WB_Extend || wb == WB_Format);
ae3bb8ea
KW
4730 }
4731
4732 return wb;
4733}
4734
85e5f08b
KW
4735STATIC WB_enum
4736S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
ae3bb8ea 4737{
85e5f08b 4738 WB_enum wb;
ae3bb8ea
KW
4739
4740 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
4741
4742 /* If we know what the previous character's break value is, don't have
4743 * to look it up */
85e5f08b 4744 if (*previous != WB_UNKNOWN) {
ae3bb8ea 4745 wb = *previous;
85e5f08b 4746 *previous = WB_UNKNOWN;
ae3bb8ea
KW
4747 /* XXX Note that doesn't change curpos, and maybe should */
4748
4749 /* But we always back up over these two types */
85e5f08b 4750 if (wb != WB_Extend && wb != WB_Format) {
ae3bb8ea
KW
4751 return wb;
4752 }
4753 }
4754
4755 if (*curpos < strbeg) {
85e5f08b 4756 return WB_EDGE;
ae3bb8ea
KW
4757 }
4758
4759 if (utf8_target) {
4760 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4761 if (! prev_char_pos) {
85e5f08b 4762 return WB_EDGE;
ae3bb8ea
KW
4763 }
4764
4765 /* Back up over Extend and Format. curpos is always just to the right
4766 * of the characater whose value we are getting */
4767 do {
4768 U8 * prev_prev_char_pos;
4769 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
4770 -1,
4771 strbeg)))
4772 {
4773 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4774 *curpos = prev_char_pos;
4775 prev_char_pos = prev_prev_char_pos;
4776 }
4777 else {
4778 *curpos = (U8 *) strbeg;
85e5f08b 4779 return WB_EDGE;
ae3bb8ea 4780 }
85e5f08b 4781 } while (wb == WB_Extend || wb == WB_Format);
ae3bb8ea
KW
4782 }
4783 else {
4784 do {
4785 if (*curpos - 2 < strbeg) {
4786 *curpos = (U8 *) strbeg;
85e5f08b 4787 return WB_EDGE;
ae3bb8ea
KW
4788 }
4789 (*curpos)--;
4790 wb = getWB_VAL_CP(*(*curpos - 1));
85e5f08b 4791 } while (wb == WB_Extend || wb == WB_Format);
ae3bb8ea
KW
4792 }
4793
4794 return wb;
4795}
4796
f73aaa43 4797/* returns -1 on failure, $+[0] on success */
99a90e59 4798STATIC SSize_t
f73aaa43 4799S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 4800{
a35a87e7 4801#if PERL_VERSION < 9 && !defined(PERL_CORE)
54df2634
NC
4802 dMY_CXT;
4803#endif
27da23d5 4804 dVAR;
ba44c216 4805 const bool utf8_target = reginfo->is_utf8_target;
4ad0818d 4806 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 4807 REGEXP *rex_sv = reginfo->prog;
8d919b0a 4808 regexp *rex = ReANY(rex_sv);
f8fc2ecf 4809 RXi_GET_DECL(rex,rexi);
5d9a96ca 4810 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 4811 regmatch_state *st;
5d9a96ca 4812 /* cache heavy used fields of st in registers */
eb578fdb
KW
4813 regnode *scan;
4814 regnode *next;
4815 U32 n = 0; /* general value; init to avoid compiler warning */
ea3daa5d 4816 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
d60de1d1 4817 char *locinput = startpos;
4d5016e5 4818 char *pushinput; /* where to continue after a PUSH */
eb578fdb 4819 I32 nextchr; /* is always set to UCHARAT(locinput) */
24d3c4a9 4820
b69b0499 4821 bool result = 0; /* return value of S_regmatch */
24d3c4a9 4822 int depth = 0; /* depth of backtrack stack */
4b196cd4
YO
4823 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
4824 const U32 max_nochange_depth =
4825 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
4826 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
4827 regmatch_state *yes_state = NULL; /* state to pop to on success of
4828 subpattern */
e2e6a0f1
YO
4829 /* mark_state piggy backs on the yes_state logic so that when we unwind
4830 the stack on success we can update the mark_state as we go */
4831 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 4832 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 4833 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 4834 U32 state_num;
5d458dd8
YO
4835 bool no_final = 0; /* prevent failure from backtracking? */
4836 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 4837 char *startpoint = locinput;
5d458dd8
YO
4838 SV *popmark = NULL; /* are we looking for a mark? */
4839 SV *sv_commit = NULL; /* last mark name seen in failure */
4840 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 4841 during a successful match */
5d458dd8
YO
4842 U32 lastopen = 0; /* last open we saw */
4843 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
4a2b275c 4844 SV* const oreplsv = GvSVn(PL_replgv);
24d3c4a9
DM
4845 /* these three flags are set by various ops to signal information to
4846 * the very next op. They have a useful lifetime of exactly one loop
4847 * iteration, and are not preserved or restored by state pushes/pops
4848 */
4849 bool sw = 0; /* the condition value in (?(cond)a|b) */
4850 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
4851 int logical = 0; /* the following EVAL is:
4852 0: (?{...})
4853 1: (?(?{...})X|Y)
4854 2: (??{...})
4855 or the following IFMATCH/UNLESSM is:
4856 false: plain (?=foo)
4857 true: used as a condition: (?(?=foo))
4858 */
81ed78b2
DM
4859 PAD* last_pad = NULL;
4860 dMULTICALL;
4861 I32 gimme = G_SCALAR;
4862 CV *caller_cv = NULL; /* who called us */
4863 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
74088413 4864 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
92da3157 4865 U32 maxopenparen = 0; /* max '(' index seen so far */
3018b823
KW
4866 int to_complement; /* Invert the result? */
4867 _char_class_number classnum;
984e6dd1 4868 bool is_utf8_pat = reginfo->is_utf8_pat;
64935bc6
KW
4869 bool match = FALSE;
4870
81ed78b2 4871
95b24440 4872#ifdef DEBUGGING
e68ec53f 4873 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
4874#endif
4875
7e68f152
FC
4876 /* protect against undef(*^R) */
4877 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
4878
81ed78b2
DM
4879 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
4880 multicall_oldcatch = 0;
4881 multicall_cv = NULL;
4882 cx = NULL;
4f8dbb2d
JL
4883 PERL_UNUSED_VAR(multicall_cop);
4884 PERL_UNUSED_VAR(newsp);
81ed78b2
DM
4885
4886
7918f24d
NC
4887 PERL_ARGS_ASSERT_REGMATCH;
4888
3b57cd43 4889 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
24b23f37 4890 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3b57cd43 4891 }));
5d9a96ca 4892
331b2dcc 4893 st = PL_regmatch_state;
5d9a96ca 4894
d6a28714 4895 /* Note that nextchr is a byte even in UTF */
7016d6eb 4896 SET_nextchr;
d6a28714
JH
4897 scan = prog;
4898 while (scan != NULL) {
8ba1375e 4899
a3621e74 4900 DEBUG_EXECUTE_r( {
6136c704 4901 SV * const prop = sv_newmortal();
1de06328 4902 regnode *rnext=regnext(scan);
f2ed9b32 4903 DUMP_EXEC_POS( locinput, scan, utf8_target );
8b9781c9 4904 regprop(rex, prop, scan, reginfo, NULL);
07be1b83
YO
4905
4906 PerlIO_printf(Perl_debug_log,
4907 "%3"IVdf":%*s%s(%"IVdf")\n",
f8fc2ecf 4908 (IV)(scan - rexi->program), depth*2, "",
07be1b83 4909 SvPVX_const(prop),
1de06328 4910 (PL_regkind[OP(scan)] == END || !rnext) ?
f8fc2ecf 4911 0 : (IV)(rnext - rexi->program));
2a782b5b 4912 });
d6a28714
JH
4913
4914 next = scan + NEXT_OFF(scan);
4915 if (next == scan)
4916 next = NULL;
40a82448 4917 state_num = OP(scan);
d6a28714 4918
40a82448 4919 reenter_switch:
3018b823 4920 to_complement = 0;
34a81e2b 4921
7016d6eb 4922 SET_nextchr;
e6ca698c 4923 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 4924
40a82448 4925 switch (state_num) {
d3d47aac 4926 case SBOL: /* /^../ and /\A../ */
9d9163fb 4927 if (locinput == reginfo->strbeg)
b8c5462f 4928 break;
d6a28714 4929 sayNO;
3c0563b9
DM
4930
4931 case MBOL: /* /^../m */
9d9163fb 4932 if (locinput == reginfo->strbeg ||
7016d6eb 4933 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 4934 {
b8c5462f
JH
4935 break;
4936 }
d6a28714 4937 sayNO;
3c0563b9 4938
3c0563b9 4939 case GPOS: /* \G */
3b0527fe 4940 if (locinput == reginfo->ganch)
d6a28714
JH
4941 break;
4942 sayNO;
ee9b8eae 4943
3c0563b9 4944 case KEEPS: /* \K */
ee9b8eae 4945 /* update the startpoint */
b93070ed 4946 st->u.keeper.val = rex->offs[0].start;
9d9163fb 4947 rex->offs[0].start = locinput - reginfo->strbeg;
4d5016e5 4948 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
a74ff37d 4949 /* NOTREACHED */
661d43c4 4950 NOT_REACHED; /* NOTREACHED */
a74ff37d 4951
ee9b8eae
YO
4952 case KEEPS_next_fail:
4953 /* rollback the start point change */
b93070ed 4954 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 4955 sayNO_SILENT;
a74ff37d 4956 /* NOTREACHED */
661d43c4 4957 NOT_REACHED; /* NOTREACHED */
3c0563b9 4958
3c0563b9 4959 case MEOL: /* /..$/m */
7016d6eb 4960 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 4961 sayNO;
b8c5462f 4962 break;
3c0563b9 4963
d3d47aac 4964 case SEOL: /* /..$/ */
7016d6eb 4965 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 4966 sayNO;
220db18a 4967 if (reginfo->strend - locinput > 1)
b8c5462f 4968 sayNO;
b8c5462f 4969 break;
3c0563b9
DM
4970
4971 case EOS: /* \z */
7016d6eb 4972 if (!NEXTCHR_IS_EOS)
b8c5462f 4973 sayNO;
d6a28714 4974 break;
3c0563b9
DM
4975
4976 case SANY: /* /./s */
7016d6eb 4977 if (NEXTCHR_IS_EOS)
4633a7c4 4978 sayNO;
28b98f76 4979 goto increment_locinput;
3c0563b9 4980
3c0563b9 4981 case REG_ANY: /* /./ */
7016d6eb 4982 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 4983 sayNO;
28b98f76
DM
4984 goto increment_locinput;
4985
166ba7cd
DM
4986
4987#undef ST
4988#define ST st->u.trie
3c0563b9 4989 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
4990 /* In this case the charclass data is available inline so
4991 we can fail fast without a lot of extra overhead.
4992 */
7016d6eb 4993 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b
YO
4994 DEBUG_EXECUTE_r(
4995 PerlIO_printf(Perl_debug_log,
4996 "%*s %sfailed to match trie start class...%s\n",
4997 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4998 );
4999 sayNO_SILENT;
a74ff37d 5000 /* NOTREACHED */
661d43c4 5001 NOT_REACHED; /* NOTREACHED */
786e8c11 5002 }
924ba076 5003 /* FALLTHROUGH */
3c0563b9 5004 case TRIE: /* (ab|cd) */
2e64971a
DM
5005 /* the basic plan of execution of the trie is:
5006 * At the beginning, run though all the states, and
5007 * find the longest-matching word. Also remember the position
5008 * of the shortest matching word. For example, this pattern:
5009 * 1 2 3 4 5
5010 * ab|a|x|abcd|abc
5011 * when matched against the string "abcde", will generate
5012 * accept states for all words except 3, with the longest
895cc420 5013 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
5014 * the position being after char 1 of the string).
5015 *
5016 * Then for each matching word, in word order (i.e. 1,2,4,5),
5017 * we run the remainder of the pattern; on each try setting
5018 * the current position to the character following the word,
5019 * returning to try the next word on failure.
5020 *
5021 * We avoid having to build a list of words at runtime by
5022 * using a compile-time structure, wordinfo[].prev, which
5023 * gives, for each word, the previous accepting word (if any).
5024 * In the case above it would contain the mappings 1->2, 2->0,
5025 * 3->0, 4->5, 5->1. We can use this table to generate, from
5026 * the longest word (4 above), a list of all words, by
5027 * following the list of prev pointers; this gives us the
5028 * unordered list 4,5,1,2. Then given the current word we have
5029 * just tried, we can go through the list and find the
5030 * next-biggest word to try (so if we just failed on word 2,
5031 * the next in the list is 4).
5032 *
5033 * Since at runtime we don't record the matching position in
5034 * the string for each word, we have to work that out for
5035 * each word we're about to process. The wordinfo table holds
5036 * the character length of each word; given that we recorded
5037 * at the start: the position of the shortest word and its
5038 * length in chars, we just need to move the pointer the
5039 * difference between the two char lengths. Depending on
5040 * Unicode status and folding, that's cheap or expensive.
5041 *
5042 * This algorithm is optimised for the case where are only a
5043 * small number of accept states, i.e. 0,1, or maybe 2.
5044 * With lots of accepts states, and having to try all of them,
5045 * it becomes quadratic on number of accept states to find all
5046 * the next words.
5047 */
5048
3dab1dad 5049 {
07be1b83 5050 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 5051 DECL_TRIE_TYPE(scan);
3dab1dad
YO
5052
5053 /* what trie are we using right now */
be8e71aa 5054 reg_trie_data * const trie
f8fc2ecf 5055 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 5056 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 5057 U32 state = trie->startstate;
166ba7cd 5058
780fcc9f
KW
5059 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5060 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
613abc6d
KW
5061 if (utf8_target
5062 && UTF8_IS_ABOVE_LATIN1(nextchr)
5063 && scan->flags == EXACTL)
5064 {
5065 /* We only output for EXACTL, as we let the folder
5066 * output this message for EXACTFLU8 to avoid
5067 * duplication */
5068 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5069 reginfo->strend);
5070 }
780fcc9f 5071 }
7016d6eb
DM
5072 if ( trie->bitmap
5073 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5074 {
3dab1dad
YO
5075 if (trie->states[ state ].wordnum) {
5076 DEBUG_EXECUTE_r(
5077 PerlIO_printf(Perl_debug_log,
5078 "%*s %smatched empty string...%s\n",
5bc10b2c 5079 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad 5080 );
20dbff7c
YO
5081 if (!trie->jump)
5082 break;
3dab1dad
YO
5083 } else {
5084 DEBUG_EXECUTE_r(
5085 PerlIO_printf(Perl_debug_log,
786e8c11 5086 "%*s %sfailed to match trie start class...%s\n",
5bc10b2c 5087 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3dab1dad
YO
5088 );
5089 sayNO_SILENT;
5090 }
5091 }
166ba7cd 5092
786e8c11
YO
5093 {
5094 U8 *uc = ( U8* )locinput;
5095
5096 STRLEN len = 0;
5097 STRLEN foldlen = 0;
5098 U8 *uscan = (U8*)NULL;
786e8c11 5099 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
5100 U32 charcount = 0; /* how many input chars we have matched */
5101 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 5102
786e8c11 5103 ST.jump = trie->jump;
786e8c11 5104 ST.me = scan;
2e64971a
DM
5105 ST.firstpos = NULL;
5106 ST.longfold = FALSE; /* char longer if folded => it's harder */
5107 ST.nextword = 0;
5108
5109 /* fully traverse the TRIE; note the position of the
5110 shortest accept state and the wordnum of the longest
5111 accept state */
07be1b83 5112
220db18a 5113 while ( state && uc <= (U8*)(reginfo->strend) ) {
786e8c11 5114 U32 base = trie->states[ state ].trans.base;
f9f4320a 5115 UV uvc = 0;
acb909b4 5116 U16 charid = 0;
2e64971a
DM
5117 U16 wordnum;
5118 wordnum = trie->states[ state ].wordnum;
5119
5120 if (wordnum) { /* it's an accept state */
5121 if (!accepted) {
5122 accepted = 1;
5123 /* record first match position */
5124 if (ST.longfold) {
5125 ST.firstpos = (U8*)locinput;
5126 ST.firstchars = 0;
5b47454d 5127 }
2e64971a
DM
5128 else {
5129 ST.firstpos = uc;
5130 ST.firstchars = charcount;
5131 }
5132 }
5133 if (!ST.nextword || wordnum < ST.nextword)
5134 ST.nextword = wordnum;
5135 ST.topword = wordnum;
786e8c11 5136 }
a3621e74 5137
07be1b83 5138 DEBUG_TRIE_EXECUTE_r({
f2ed9b32 5139 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
a3621e74 5140 PerlIO_printf( Perl_debug_log,
2e64971a 5141 "%*s %sState: %4"UVxf" Accepted: %c ",
5bc10b2c 5142 2+depth * 2, "", PL_colors[4],
2e64971a 5143 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 5144 });
a3621e74 5145
2e64971a 5146 /* read a char and goto next state */
220db18a 5147 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6dd2be57 5148 I32 offset;
55eed653
NC
5149 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5150 uscan, len, uvc, charid, foldlen,
5151 foldbuf, uniflags);
2e64971a
DM
5152 charcount++;
5153 if (foldlen>0)
5154 ST.longfold = TRUE;
5b47454d 5155 if (charid &&
6dd2be57
DM
5156 ( ((offset =
5157 base + charid - 1 - trie->uniquecharcount)) >= 0)
5158
5159 && ((U32)offset < trie->lasttrans)
5160 && trie->trans[offset].check == state)
5b47454d 5161 {
6dd2be57 5162 state = trie->trans[offset].next;
5b47454d
DM
5163 }
5164 else {
5165 state = 0;
5166 }
5167 uc += len;
5168
5169 }
5170 else {
a3621e74
YO
5171 state = 0;
5172 }
5173 DEBUG_TRIE_EXECUTE_r(
e4584336 5174 PerlIO_printf( Perl_debug_log,
786e8c11 5175 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
e4584336 5176 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
5177 );
5178 }
2e64971a 5179 if (!accepted)
a3621e74 5180 sayNO;
a3621e74 5181
2e64971a
DM
5182 /* calculate total number of accept states */
5183 {
5184 U16 w = ST.topword;
5185 accepted = 0;
5186 while (w) {
5187 w = trie->wordinfo[w].prev;
5188 accepted++;
5189 }
5190 ST.accepted = accepted;
5191 }
5192
166ba7cd
DM
5193 DEBUG_EXECUTE_r(
5194 PerlIO_printf( Perl_debug_log,
5195 "%*s %sgot %"IVdf" possible matches%s\n",
5bc10b2c 5196 REPORT_CODE_OFF + depth * 2, "",
166ba7cd
DM
5197 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5198 );
2e64971a 5199 goto trie_first_try; /* jump into the fail handler */
786e8c11 5200 }}
a74ff37d 5201 /* NOTREACHED */
661d43c4 5202 NOT_REACHED; /* NOTREACHED */
2e64971a
DM
5203
5204 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
5205 {
5206 U8 *uc;
fae667d5
YO
5207 if ( ST.jump) {
5208 REGCP_UNWIND(ST.cp);
a8d1f4b4 5209 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 5210 }
2e64971a
DM
5211 if (!--ST.accepted) {
5212 DEBUG_EXECUTE_r({
5213 PerlIO_printf( Perl_debug_log,
5214 "%*s %sTRIE failed...%s\n",
5215 REPORT_CODE_OFF+depth*2, "",
5216 PL_colors[4],
5217 PL_colors[5] );
5218 });
5219 sayNO_SILENT;
5220 }
5221 {
5222 /* Find next-highest word to process. Note that this code
5223 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
5224 U16 min = 0;
5225 U16 word;
5226 U16 const nextword = ST.nextword;
5227 reg_trie_wordinfo * const wordinfo
2e64971a
DM
5228 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5229 for (word=ST.topword; word; word=wordinfo[word].prev) {
5230 if (word > nextword && (!min || word < min))
5231 min = word;
5232 }
5233 ST.nextword = min;
5234 }
5235
fae667d5 5236 trie_first_try:
5d458dd8
YO
5237 if (do_cutgroup) {
5238 do_cutgroup = 0;
5239 no_final = 0;
5240 }
fae667d5
YO
5241
5242 if ( ST.jump) {
b93070ed 5243 ST.lastparen = rex->lastparen;
f6033a9d 5244 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 5245 REGCP_SET(ST.cp);
2e64971a 5246 }
a3621e74 5247
2e64971a 5248 /* find start char of end of current word */
166ba7cd 5249 {
2e64971a 5250 U32 chars; /* how many chars to skip */
2e64971a
DM
5251 reg_trie_data * const trie
5252 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5253
5254 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5255 >= ST.firstchars);
5256 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5257 - ST.firstchars;
a059a757 5258 uc = ST.firstpos;
2e64971a
DM
5259
5260 if (ST.longfold) {
5261 /* the hard option - fold each char in turn and find
5262 * its folded length (which may be different */
5263 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5264 STRLEN foldlen;
5265 STRLEN len;
d9a396a3 5266 UV uvc;
2e64971a
DM
5267 U8 *uscan;
5268
5269 while (chars) {
f2ed9b32 5270 if (utf8_target) {
c80e42f3 5271 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
2e64971a
DM
5272 uniflags);
5273 uc += len;
5274 }
5275 else {
5276 uvc = *uc;
5277 uc++;
5278 }
5279 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5280 uscan = foldbuf;
5281 while (foldlen) {
5282 if (!--chars)
5283 break;
c80e42f3 5284 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
2e64971a
DM
5285 uniflags);
5286 uscan += len;
5287 foldlen -= len;
5288 }
5289 }
a3621e74 5290 }
2e64971a 5291 else {
f2ed9b32 5292 if (utf8_target)
2e64971a
DM
5293 while (chars--)
5294 uc += UTF8SKIP(uc);
5295 else
5296 uc += chars;
5297 }
2e64971a 5298 }
166ba7cd 5299
6603fe3e
DM
5300 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5301 ? ST.jump[ST.nextword]
5302 : NEXT_OFF(ST.me));
166ba7cd 5303
2e64971a
DM
5304 DEBUG_EXECUTE_r({
5305 PerlIO_printf( Perl_debug_log,
5306 "%*s %sTRIE matched word #%d, continuing%s\n",
5307 REPORT_CODE_OFF+depth*2, "",
5308 PL_colors[4],
5309 ST.nextword,
5310 PL_colors[5]
5311 );
5312 });
5313
5314 if (ST.accepted > 1 || has_cutgroup) {
a059a757 5315 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
a74ff37d 5316 /* NOTREACHED */
661d43c4 5317 NOT_REACHED; /* NOTREACHED */
166ba7cd 5318 }
2e64971a
DM
5319 /* only one choice left - just continue */
5320 DEBUG_EXECUTE_r({
5321 AV *const trie_words
5322 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
d0bec203
HS
5323 SV ** const tmp = trie_words
5324 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
2e64971a
DM
5325 SV *sv= tmp ? sv_newmortal() : NULL;
5326
5327 PerlIO_printf( Perl_debug_log,
5328 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
5329 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
5330 ST.nextword,
5331 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5332 PL_colors[0], PL_colors[1],
c89df6cf 5333 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
5334 )
5335 : "not compiled under -Dr",
5336 PL_colors[5] );
5337 });
5338
a059a757 5339 locinput = (char*)uc;
2e64971a 5340 continue; /* execute rest of RE */
a74ff37d 5341 /* NOTREACHED */
a059a757 5342 }
166ba7cd
DM
5343#undef ST
5344
a4525e78 5345 case EXACTL: /* /abc/l */
780fcc9f 5346 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
613abc6d
KW
5347
5348 /* Complete checking would involve going through every character
5349 * matched by the string to see if any is above latin1. But the
5350 * comparision otherwise might very well be a fast assembly
5351 * language routine, and I (khw) don't think slowing things down
5352 * just to check for this warning is worth it. So this just checks
5353 * the first character */
5354 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5355 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5356 }
780fcc9f 5357 /* FALLTHROUGH */
3c0563b9 5358 case EXACT: { /* /abc/ */
95b24440 5359 char *s = STRING(scan);
24d3c4a9 5360 ln = STR_LEN(scan);
984e6dd1 5361 if (utf8_target != is_utf8_pat) {
bc517b45 5362 /* The target and the pattern have differing utf8ness. */
1aa99e6b 5363 char *l = locinput;
24d3c4a9 5364 const char * const e = s + ln;
a72c7584 5365
f2ed9b32 5366 if (utf8_target) {
e6a3850e
KW
5367 /* The target is utf8, the pattern is not utf8.
5368 * Above-Latin1 code points can't match the pattern;
5369 * invariants match exactly, and the other Latin1 ones need
5370 * to be downgraded to a single byte in order to do the
5371 * comparison. (If we could be confident that the target
5372 * is not malformed, this could be refactored to have fewer
5373 * tests by just assuming that if the first bytes match, it
5374 * is an invariant, but there are tests in the test suite
5375 * dealing with (??{...}) which violate this) */
1aa99e6b 5376 while (s < e) {
220db18a
DM
5377 if (l >= reginfo->strend
5378 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5379 {
e6a3850e
KW
5380 sayNO;
5381 }
5382 if (UTF8_IS_INVARIANT(*(U8*)l)) {
5383 if (*l != *s) {
5384 sayNO;
5385 }
5386 l++;
5387 }
5388 else {
94bb8c36
KW
5389 if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5390 {
e6a3850e
KW
5391 sayNO;
5392 }
5393 l += 2;
5394 }
5395 s++;
1aa99e6b 5396 }
5ff6fc6d
JH
5397 }
5398 else {
5399 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 5400 while (s < e) {
220db18a
DM
5401 if (l >= reginfo->strend
5402 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
e6a3850e
KW
5403 {
5404 sayNO;
5405 }
5406 if (UTF8_IS_INVARIANT(*(U8*)s)) {
5407 if (*s != *l) {
5408 sayNO;
5409 }
5410 s++;
5411 }
5412 else {
94bb8c36
KW
5413 if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5414 {
e6a3850e
KW
5415 sayNO;
5416 }
5417 s += 2;
5418 }
5419 l++;
1aa99e6b 5420 }
5ff6fc6d 5421 }
1aa99e6b 5422 locinput = l;
1aa99e6b 5423 }
5ac65bff
KW
5424 else {
5425 /* The target and the pattern have the same utf8ness. */
5426 /* Inline the first character, for speed. */
220db18a 5427 if (reginfo->strend - locinput < ln
5ac65bff
KW
5428 || UCHARAT(s) != nextchr
5429 || (ln > 1 && memNE(s, locinput, ln)))
5430 {
5431 sayNO;
5432 }
5433 locinput += ln;
5434 }
d6a28714 5435 break;
95b24440 5436 }
7016d6eb 5437
3c0563b9 5438 case EXACTFL: { /* /abc/il */
a932d541 5439 re_fold_t folder;
9a5a5549
KW
5440 const U8 * fold_array;
5441 const char * s;
d513472c 5442 U32 fold_utf8_flags;
9a5a5549 5443
780fcc9f 5444 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
f67f9e53
KW
5445 folder = foldEQ_locale;
5446 fold_array = PL_fold_locale;
cea315b6 5447 fold_utf8_flags = FOLDEQ_LOCALE;
9a5a5549
KW
5448 goto do_exactf;
5449
a4525e78
KW
5450 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
5451 is effectively /u; hence to match, target
5452 must be UTF-8. */
5453 if (! utf8_target) {
5454 sayNO;
5455 }
613abc6d
KW
5456 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5457 | FOLDEQ_S1_FOLDS_SANE;
ec5e0e1c
KW
5458 folder = foldEQ_latin1;
5459 fold_array = PL_fold_latin1;
a4525e78
KW
5460 goto do_exactf;
5461
3c0563b9 5462 case EXACTFU_SS: /* /\x{df}/iu */
3c0563b9 5463 case EXACTFU: /* /abc/iu */
9a5a5549
KW
5464 folder = foldEQ_latin1;
5465 fold_array = PL_fold_latin1;
984e6dd1 5466 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
5467 goto do_exactf;
5468
098b07d5
KW
5469 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
5470 patterns */
5471 assert(! is_utf8_pat);
924ba076 5472 /* FALLTHROUGH */
3c0563b9 5473 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
5474 folder = foldEQ_latin1;
5475 fold_array = PL_fold_latin1;
57014d77 5476 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
5477 goto do_exactf;
5478
2fdb7295
KW
5479 case EXACTF: /* /abc/i This node only generated for
5480 non-utf8 patterns */
5481 assert(! is_utf8_pat);
9a5a5549
KW
5482 folder = foldEQ;
5483 fold_array = PL_fold;
62bf7766 5484 fold_utf8_flags = 0;
9a5a5549
KW
5485
5486 do_exactf:
5487 s = STRING(scan);
24d3c4a9 5488 ln = STR_LEN(scan);
d6a28714 5489
31f05a37
KW
5490 if (utf8_target
5491 || is_utf8_pat
5492 || state_num == EXACTFU_SS
5493 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
5494 {
3c760661
KW
5495 /* Either target or the pattern are utf8, or has the issue where
5496 * the fold lengths may differ. */
be8e71aa 5497 const char * const l = locinput;
220db18a 5498 char *e = reginfo->strend;
bc517b45 5499
984e6dd1 5500 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
fa5b1667 5501 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
5502 {
5503 sayNO;
5486206c 5504 }
d07ddd77 5505 locinput = e;
d07ddd77 5506 break;
a0ed51b3 5507 }
d6a28714 5508
0a138b74 5509 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
5510 if (UCHARAT(s) != nextchr
5511 && !NEXTCHR_IS_EOS
5512 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 5513 {
a0ed51b3 5514 sayNO;
9a5a5549 5515 }
220db18a 5516 if (reginfo->strend - locinput < ln)
b8c5462f 5517 sayNO;
9a5a5549 5518 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 5519 sayNO;
24d3c4a9 5520 locinput += ln;
a0d0e21e 5521 break;
9a5a5549 5522 }
63ac0dad 5523
3c0563b9 5524 case NBOUNDL: /* /\B/l */
5c388b33
KW
5525 to_complement = 1;
5526 /* FALLTHROUGH */
5527
5528 case BOUNDL: /* /\b/l */
780fcc9f 5529 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6
KW
5530
5531 if (FLAGS(scan) != TRADITIONAL_BOUND) {
5532 if (! IN_UTF8_CTYPE_LOCALE) {
5533 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
5534 B_ON_NON_UTF8_LOCALE_IS_WRONG);
5535 }
5536 goto boundu;
5537 }
5538
5c388b33
KW
5539 if (utf8_target) {
5540 if (locinput == reginfo->strbeg)
5541 ln = isWORDCHAR_LC('\n');
5542 else {
5543 ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
5544 (U8*)(reginfo->strbeg)));
5545 }
5546 n = (NEXTCHR_IS_EOS)
5547 ? isWORDCHAR_LC('\n')
5548 : isWORDCHAR_LC_utf8((U8*)locinput);
5549 }
5550 else { /* Here the string isn't utf8 */
5551 ln = (locinput == reginfo->strbeg)
5552 ? isWORDCHAR_LC('\n')
5553 : isWORDCHAR_LC(UCHARAT(locinput - 1));
5554 n = (NEXTCHR_IS_EOS)
5555 ? isWORDCHAR_LC('\n')
5556 : isWORDCHAR_LC(nextchr);
5557 }
5558 if (to_complement ^ (ln == n)) {
5559 sayNO;
5560 }
5561 break;
5562
5563 case NBOUND: /* /\B/ */
5564 to_complement = 1;
780fcc9f 5565 /* FALLTHROUGH */
5c388b33 5566
3c0563b9 5567 case BOUND: /* /\b/ */
5c388b33
KW
5568 if (utf8_target) {
5569 goto bound_utf8;
5570 }
5571 goto bound_ascii_match_only;
5572
5573 case NBOUNDA: /* /\B/a */
5574 to_complement = 1;
5575 /* FALLTHROUGH */
5576
3c0563b9 5577 case BOUNDA: /* /\b/a */
5c388b33 5578
c52b8b12 5579 bound_ascii_match_only:
5c388b33
KW
5580 /* Here the string isn't utf8, or is utf8 and only ascii characters
5581 * are to match \w. In the latter case looking at the byte just
5582 * prior to the current one may be just the final byte of a
5583 * multi-byte character. This is ok. There are two cases:
5584 * 1) it is a single byte character, and then the test is doing
5585 * just what it's supposed to.
5586 * 2) it is a multi-byte character, in which case the final byte is
5587 * never mistakable for ASCII, and so the test will say it is
5588 * not a word character, which is the correct answer. */
5589 ln = (locinput == reginfo->strbeg)
5590 ? isWORDCHAR_A('\n')
5591 : isWORDCHAR_A(UCHARAT(locinput - 1));
5592 n = (NEXTCHR_IS_EOS)
5593 ? isWORDCHAR_A('\n')
5594 : isWORDCHAR_A(nextchr);
5595 if (to_complement ^ (ln == n)) {
5596 sayNO;
5597 }
5598 break;
5599
3c0563b9 5600 case NBOUNDU: /* /\B/u */
5c388b33
KW
5601 to_complement = 1;
5602 /* FALLTHROUGH */
b2680017 5603
5c388b33 5604 case BOUNDU: /* /\b/u */
64935bc6
KW
5605
5606 boundu:
5c388b33
KW
5607 if (utf8_target) {
5608
64935bc6
KW
5609 bound_utf8:
5610 switch((bound_type) FLAGS(scan)) {
5611 case TRADITIONAL_BOUND:
aa383448 5612 ln = (locinput == reginfo->strbeg)
996de84d 5613 ? 0 /* isWORDCHAR_L1('\n') */
aa383448
KW
5614 : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
5615 (U8*)(reginfo->strbeg)));
5616 n = (NEXTCHR_IS_EOS)
996de84d 5617 ? 0 /* isWORDCHAR_L1('\n') */
aa383448 5618 : isWORDCHAR_utf8((U8*)locinput);
b53eee5d 5619 match = cBOOL(ln != n);
64935bc6
KW
5620 break;
5621 case GCB_BOUND:
5622 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5623 match = TRUE; /* GCB always matches at begin and
5624 end */
5625 }
5626 else {
5627 /* Find the gcb values of previous and current
5628 * chars, then see if is a break point */
5629 match = isGCB(getGCB_VAL_UTF8(
5630 reghop3((U8*)locinput,
5631 -1,
5632 (U8*)(reginfo->strbeg)),
5633 (U8*) reginfo->strend),
5634 getGCB_VAL_UTF8((U8*) locinput,
5635 (U8*) reginfo->strend));
5636 }
5637 break;
06ae2722
KW
5638
5639 case SB_BOUND: /* Always matches at begin and end */
5640 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5641 match = TRUE;
5642 }
5643 else {
5644 match = isSB(getSB_VAL_UTF8(
5645 reghop3((U8*)locinput,
5646 -1,
5647 (U8*)(reginfo->strbeg)),
5648 (U8*) reginfo->strend),
5649 getSB_VAL_UTF8((U8*) locinput,
5650 (U8*) reginfo->strend),
5651 (U8*) reginfo->strbeg,
5652 (U8*) locinput,
5653 (U8*) reginfo->strend,
5654 utf8_target);
5655 }
5656 break;
5657
ae3bb8ea
KW
5658 case WB_BOUND:
5659 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5660 match = TRUE;
5661 }
5662 else {
85e5f08b 5663 match = isWB(WB_UNKNOWN,
ae3bb8ea
KW
5664 getWB_VAL_UTF8(
5665 reghop3((U8*)locinput,
5666 -1,
5667 (U8*)(reginfo->strbeg)),
5668 (U8*) reginfo->strend),
5669 getWB_VAL_UTF8((U8*) locinput,
5670 (U8*) reginfo->strend),
5671 (U8*) reginfo->strbeg,
5672 (U8*) locinput,
5673 (U8*) reginfo->strend,
5674 utf8_target);
5675 }
5676 break;
64935bc6 5677 }
b2680017 5678 }
64935bc6
KW
5679 else { /* Not utf8 target */
5680 switch((bound_type) FLAGS(scan)) {
5681 case TRADITIONAL_BOUND:
aa383448 5682 ln = (locinput == reginfo->strbeg)
996de84d 5683 ? 0 /* isWORDCHAR_L1('\n') */
aa383448
KW
5684 : isWORDCHAR_L1(UCHARAT(locinput - 1));
5685 n = (NEXTCHR_IS_EOS)
996de84d 5686 ? 0 /* isWORDCHAR_L1('\n') */
aa383448 5687 : isWORDCHAR_L1(nextchr);
b53eee5d 5688 match = cBOOL(ln != n);
64935bc6 5689 break;
cfaf538b 5690
64935bc6
KW
5691 case GCB_BOUND:
5692 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5693 match = TRUE; /* GCB always matches at begin and
5694 end */
5695 }
5696 else { /* Only CR-LF combo isn't a GCB in 0-255
5697 range */
5698 match = UCHARAT(locinput - 1) != '\r'
5699 || UCHARAT(locinput) != '\n';
5700 }
5701 break;
06ae2722
KW
5702
5703 case SB_BOUND: /* Always matches at begin and end */
5704 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5705 match = TRUE;
5706 }
5707 else {
5708 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
5709 getSB_VAL_CP(UCHARAT(locinput)),
5710 (U8*) reginfo->strbeg,
5711 (U8*) locinput,
5712 (U8*) reginfo->strend,
5713 utf8_target);
5714 }
5715 break;
5716
ae3bb8ea
KW
5717 case WB_BOUND:
5718 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5719 match = TRUE;
5720 }
5721 else {
85e5f08b 5722 match = isWB(WB_UNKNOWN,
ae3bb8ea
KW
5723 getWB_VAL_CP(UCHARAT(locinput -1)),
5724 getWB_VAL_CP(UCHARAT(locinput)),
5725 (U8*) reginfo->strbeg,
5726 (U8*) locinput,
5727 (U8*) reginfo->strend,
5728 utf8_target);
5729 }
5730 break;
64935bc6 5731 }
b2680017 5732 }
5c388b33 5733
64935bc6 5734 if (to_complement ^ ! match) {
5c388b33
KW
5735 sayNO;
5736 }
b2680017 5737 break;
3c0563b9 5738
a4525e78 5739 case ANYOFL: /* /[abc]/l */
780fcc9f 5740 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30
KW
5741
5742 if ((FLAGS(scan) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE)
5743 {
5744 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
5745 }
780fcc9f 5746 /* FALLTHROUGH */
ac44c12e 5747 case ANYOFD: /* /[abc]/d */
a4525e78 5748 case ANYOF: /* /[abc]/ */
7016d6eb
DM
5749 if (NEXTCHR_IS_EOS)
5750 sayNO;
e0193e47 5751 if (utf8_target) {
3db24e1e
KW
5752 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
5753 utf8_target))
09b08e9b 5754 sayNO;
635cd5d4 5755 locinput += UTF8SKIP(locinput);
ffc61ed2
JH
5756 }
5757 else {
20ed0b26 5758 if (!REGINCLASS(rex, scan, (U8*)locinput))
09b08e9b 5759 sayNO;
3640db6b 5760 locinput++;
e0f9d4a8 5761 }
b8c5462f 5762 break;
3c0563b9 5763
3018b823
KW
5764 /* The argument (FLAGS) to all the POSIX node types is the class number
5765 * */
ee9a90b8 5766
3018b823
KW
5767 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
5768 to_complement = 1;
5769 /* FALLTHROUGH */
5770
5771 case POSIXL: /* \w or [:punct:] etc. under /l */
780fcc9f 5772 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823 5773 if (NEXTCHR_IS_EOS)
bedac28b 5774 sayNO;
bedac28b 5775
3018b823
KW
5776 /* Use isFOO_lc() for characters within Latin1. (Note that
5777 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5778 * wouldn't be invariant) */
5779 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
eb4e9c04 5780 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
bedac28b
KW
5781 sayNO;
5782 }
5783 }
3018b823
KW
5784 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5785 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
94bb8c36 5786 (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
3018b823
KW
5787 *(locinput + 1))))))
5788 {
bedac28b 5789 sayNO;
3018b823 5790 }
bedac28b 5791 }
3018b823 5792 else { /* Here, must be an above Latin-1 code point */
613abc6d 5793 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
01f55654 5794 goto utf8_posix_above_latin1;
bedac28b 5795 }
3018b823
KW
5796
5797 /* Here, must be utf8 */
5798 locinput += UTF8SKIP(locinput);
bedac28b
KW
5799 break;
5800
3018b823
KW
5801 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
5802 to_complement = 1;
5803 /* FALLTHROUGH */
5804
5805 case POSIXD: /* \w or [:punct:] etc. under /d */
bedac28b 5806 if (utf8_target) {
3018b823 5807 goto utf8_posix;
bedac28b 5808 }
3018b823
KW
5809 goto posixa;
5810
5811 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
bedac28b 5812
3018b823 5813 if (NEXTCHR_IS_EOS) {
bedac28b
KW
5814 sayNO;
5815 }
bedac28b 5816
3018b823
KW
5817 /* All UTF-8 variants match */
5818 if (! UTF8_IS_INVARIANT(nextchr)) {
5819 goto increment_locinput;
bedac28b 5820 }
ee9a90b8 5821
3018b823
KW
5822 to_complement = 1;
5823 /* FALLTHROUGH */
5824
5825 case POSIXA: /* \w or [:punct:] etc. under /a */
5826
5827 posixa:
5828 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
5829 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
5830 * character is a single byte */
20d0b1e9 5831
3018b823
KW
5832 if (NEXTCHR_IS_EOS
5833 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
5834 FLAGS(scan)))))
5835 {
0658cdde
KW
5836 sayNO;
5837 }
3018b823
KW
5838
5839 /* Here we are either not in utf8, or we matched a utf8-invariant,
5840 * so the next char is the next byte */
3640db6b 5841 locinput++;
0658cdde 5842 break;
3c0563b9 5843
3018b823
KW
5844 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
5845 to_complement = 1;
5846 /* FALLTHROUGH */
5847
5848 case POSIXU: /* \w or [:punct:] etc. under /u */
5849 utf8_posix:
5850 if (NEXTCHR_IS_EOS) {
0658cdde
KW
5851 sayNO;
5852 }
3018b823
KW
5853
5854 /* Use _generic_isCC() for characters within Latin1. (Note that
5855 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5856 * wouldn't be invariant) */
5857 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5858 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
5859 FLAGS(scan)))))
5860 {
5861 sayNO;
5862 }
5863 locinput++;
5864 }
5865 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5866 if (! (to_complement
94bb8c36 5867 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
3018b823 5868 *(locinput + 1)),
94bb8c36 5869 FLAGS(scan)))))
3018b823
KW
5870 {
5871 sayNO;
5872 }
5873 locinput += 2;
5874 }
5875 else { /* Handle above Latin-1 code points */
c52b8b12 5876 utf8_posix_above_latin1:
3018b823
KW
5877 classnum = (_char_class_number) FLAGS(scan);
5878 if (classnum < _FIRST_NON_SWASH_CC) {
5879
5880 /* Here, uses a swash to find such code points. Load if if
5881 * not done already */
5882 if (! PL_utf8_swash_ptrs[classnum]) {
5883 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
5884 PL_utf8_swash_ptrs[classnum]
5885 = _core_swash_init("utf8",
2a16ac92
KW
5886 "",
5887 &PL_sv_undef, 1, 0,
5888 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
5889 }
5890 if (! (to_complement
5891 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
5892 (U8 *) locinput, TRUE))))
5893 {
5894 sayNO;
5895 }
5896 }
5897 else { /* Here, uses macros to find above Latin-1 code points */
5898 switch (classnum) {
779cf272 5899 case _CC_ENUM_SPACE:
3018b823
KW
5900 if (! (to_complement
5901 ^ cBOOL(is_XPERLSPACE_high(locinput))))
5902 {
5903 sayNO;
5904 }
5905 break;
5906 case _CC_ENUM_BLANK:
5907 if (! (to_complement
5908 ^ cBOOL(is_HORIZWS_high(locinput))))
5909 {
5910 sayNO;
5911 }
5912 break;
5913 case _CC_ENUM_XDIGIT:
5914 if (! (to_complement
5915 ^ cBOOL(is_XDIGIT_high(locinput))))
5916 {
5917 sayNO;
5918 }
5919 break;
5920 case _CC_ENUM_VERTSPACE:
5921 if (! (to_complement
5922 ^ cBOOL(is_VERTWS_high(locinput))))
5923 {
5924 sayNO;
5925 }
5926 break;
5927 default: /* The rest, e.g. [:cntrl:], can't match
5928 above Latin1 */
5929 if (! to_complement) {
5930 sayNO;
5931 }
5932 break;
5933 }
5934 }
5935 locinput += UTF8SKIP(locinput);
5936 }
5937 break;
0658cdde 5938
37e2e78e
KW
5939 case CLUMP: /* Match \X: logical Unicode character. This is defined as
5940 a Unicode extended Grapheme Cluster */
7016d6eb 5941 if (NEXTCHR_IS_EOS)
a0ed51b3 5942 sayNO;
f2ed9b32 5943 if (! utf8_target) {
37e2e78e
KW
5944
5945 /* Match either CR LF or '.', as all the other possibilities
5946 * require utf8 */
5947 locinput++; /* Match the . or CR */
cc3b396d
KW
5948 if (nextchr == '\r' /* And if it was CR, and the next is LF,
5949 match the LF */
220db18a 5950 && locinput < reginfo->strend
e699a1d5
KW
5951 && UCHARAT(locinput) == '\n')
5952 {
5953 locinput++;
5954 }
37e2e78e
KW
5955 }
5956 else {
5957
64935bc6 5958 /* Get the gcb type for the current character */
85e5f08b 5959 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
64935bc6 5960 (U8*) reginfo->strend);
37e2e78e 5961
64935bc6
KW
5962 /* Then scan through the input until we get to the first
5963 * character whose type is supposed to be a gcb with the
5964 * current character. (There is always a break at the
5965 * end-of-input) */
5966 locinput += UTF8SKIP(locinput);
5967 while (locinput < reginfo->strend) {
85e5f08b 5968 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
64935bc6
KW
5969 (U8*) reginfo->strend);
5970 if (isGCB(prev_gcb, cur_gcb)) {
5971 break;
27d4fc33 5972 }
11dfcd49 5973
64935bc6
KW
5974 prev_gcb = cur_gcb;
5975 locinput += UTF8SKIP(locinput);
5976 }
37e2e78e 5977
37e2e78e 5978
37e2e78e 5979 }
a0ed51b3 5980 break;
81714fb9 5981
3c0563b9 5982 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
5983 { /* The capture buffer cases. The ones beginning with N for the
5984 named buffers just convert to the equivalent numbered and
5985 pretend they were called as the corresponding numbered buffer
5986 op. */
26ecd678
TC
5987 /* don't initialize these in the declaration, it makes C++
5988 unhappy */
9d9163fb 5989 const char *s;
ff1157ca 5990 char type;
8368298a
TC
5991 re_fold_t folder;
5992 const U8 *fold_array;
26ecd678 5993 UV utf8_fold_flags;
8368298a 5994
780fcc9f 5995 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
d7ef4b73
KW
5996 folder = foldEQ_locale;
5997 fold_array = PL_fold_locale;
5998 type = REFFL;
cea315b6 5999 utf8_fold_flags = FOLDEQ_LOCALE;
d7ef4b73
KW
6000 goto do_nref;
6001
3c0563b9 6002 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
6003 folder = foldEQ_latin1;
6004 fold_array = PL_fold_latin1;
6005 type = REFFA;
6006 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6007 goto do_nref;
6008
3c0563b9 6009 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
6010 folder = foldEQ_latin1;
6011 fold_array = PL_fold_latin1;
6012 type = REFFU;
d513472c 6013 utf8_fold_flags = 0;
d7ef4b73
KW
6014 goto do_nref;
6015
3c0563b9 6016 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
6017 folder = foldEQ;
6018 fold_array = PL_fold;
6019 type = REFF;
d513472c 6020 utf8_fold_flags = 0;
d7ef4b73
KW
6021 goto do_nref;
6022
3c0563b9 6023 case NREF: /* /\g{name}/ */
d7ef4b73 6024 type = REF;
83d7b90b
KW
6025 folder = NULL;
6026 fold_array = NULL;
d513472c 6027 utf8_fold_flags = 0;
d7ef4b73
KW
6028 do_nref:
6029
6030 /* For the named back references, find the corresponding buffer
6031 * number */
0a4db386
YO
6032 n = reg_check_named_buff_matched(rex,scan);
6033
d7ef4b73 6034 if ( ! n ) {
81714fb9 6035 sayNO;
d7ef4b73
KW
6036 }
6037 goto do_nref_ref_common;
6038
3c0563b9 6039 case REFFL: /* /\1/il */
780fcc9f 6040 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
d7ef4b73
KW
6041 folder = foldEQ_locale;
6042 fold_array = PL_fold_locale;
cea315b6 6043 utf8_fold_flags = FOLDEQ_LOCALE;
d7ef4b73
KW
6044 goto do_ref;
6045
3c0563b9 6046 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
6047 folder = foldEQ_latin1;
6048 fold_array = PL_fold_latin1;
6049 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6050 goto do_ref;
6051
3c0563b9 6052 case REFFU: /* /\1/iu */
d7ef4b73
KW
6053 folder = foldEQ_latin1;
6054 fold_array = PL_fold_latin1;
d513472c 6055 utf8_fold_flags = 0;
d7ef4b73
KW
6056 goto do_ref;
6057
3c0563b9 6058 case REFF: /* /\1/i */
d7ef4b73
KW
6059 folder = foldEQ;
6060 fold_array = PL_fold;
d513472c 6061 utf8_fold_flags = 0;
83d7b90b 6062 goto do_ref;
d7ef4b73 6063
3c0563b9 6064 case REF: /* /\1/ */
83d7b90b
KW
6065 folder = NULL;
6066 fold_array = NULL;
d513472c 6067 utf8_fold_flags = 0;
83d7b90b 6068
d7ef4b73 6069 do_ref:
81714fb9 6070 type = OP(scan);
d7ef4b73
KW
6071 n = ARG(scan); /* which paren pair */
6072
6073 do_nref_ref_common:
b93070ed 6074 ln = rex->offs[n].start;
1cb48e53 6075 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
b93070ed 6076 if (rex->lastparen < n || ln == -1)
af3f8c16 6077 sayNO; /* Do not match unless seen CLOSEn. */
b93070ed 6078 if (ln == rex->offs[n].end)
a0d0e21e 6079 break;
a0ed51b3 6080
9d9163fb 6081 s = reginfo->strbeg + ln;
d7ef4b73 6082 if (type != REF /* REF can do byte comparison */
31f05a37
KW
6083 && (utf8_target || type == REFFU || type == REFFL))
6084 {
220db18a 6085 char * limit = reginfo->strend;
d7ef4b73
KW
6086
6087 /* This call case insensitively compares the entire buffer
6088 * at s, with the current input starting at locinput, but
220db18a
DM
6089 * not going off the end given by reginfo->strend, and
6090 * returns in <limit> upon success, how much of the
6091 * current input was matched */
b93070ed 6092 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
d513472c 6093 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
6094 {
6095 sayNO;
a0ed51b3 6096 }
d7ef4b73 6097 locinput = limit;
a0ed51b3
LW
6098 break;
6099 }
6100
d7ef4b73 6101 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
6102 if (!NEXTCHR_IS_EOS &&
6103 UCHARAT(s) != nextchr &&
81714fb9 6104 (type == REF ||
d7ef4b73 6105 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 6106 sayNO;
b93070ed 6107 ln = rex->offs[n].end - ln;
220db18a 6108 if (locinput + ln > reginfo->strend)
4633a7c4 6109 sayNO;
81714fb9 6110 if (ln > 1 && (type == REF
24d3c4a9 6111 ? memNE(s, locinput, ln)
d7ef4b73 6112 : ! folder(s, locinput, ln)))
4633a7c4 6113 sayNO;
24d3c4a9 6114 locinput += ln;
a0d0e21e 6115 break;
81714fb9 6116 }
3c0563b9
DM
6117
6118 case NOTHING: /* null op; e.g. the 'nothing' following
6119 * the '*' in m{(a+|b)*}' */
6120 break;
6121 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 6122 break;
3c0563b9 6123
40a82448
DM
6124#undef ST
6125#define ST st->u.eval
c277df42 6126 {
c277df42 6127 SV *ret;
d2f13c59 6128 REGEXP *re_sv;
6bda09f9 6129 regexp *re;
f8fc2ecf 6130 regexp_internal *rei;
1a147d38
YO
6131 regnode *startpoint;
6132
3c0563b9 6133 case GOSTART: /* (?R) */
e7707071
YO
6134 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
6135 if (cur_eval && cur_eval->locinput==locinput) {
24b23f37 6136 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
1a147d38 6137 Perl_croak(aTHX_ "Infinite recursion in regex");
4b196cd4 6138 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
6139 Perl_croak(aTHX_
6140 "Pattern subroutine nesting without pos change"
6141 " exceeded limit in regex");
6bda09f9
YO
6142 } else {
6143 nochange_depth = 0;
1a147d38 6144 }
288b8c02 6145 re_sv = rex_sv;
6bda09f9 6146 re = rex;
f8fc2ecf 6147 rei = rexi;
1a147d38 6148 if (OP(scan)==GOSUB) {
6bda09f9
YO
6149 startpoint = scan + ARG2L(scan);
6150 ST.close_paren = ARG(scan);
6151 } else {
f8fc2ecf 6152 startpoint = rei->program+1;
6bda09f9
YO
6153 ST.close_paren = 0;
6154 }
d1b2014a
YO
6155
6156 /* Save all the positions seen so far. */
6157 ST.cp = regcppush(rex, 0, maxopenparen);
6158 REGCP_SET(ST.lastcp);
6159
6160 /* and then jump to the code we share with EVAL */
6bda09f9 6161 goto eval_recurse_doit;
a74ff37d 6162 /* NOTREACHED */
3c0563b9 6163
6bda09f9
YO
6164 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
6165 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 6166 if ( ++nochange_depth > max_nochange_depth )
1a147d38 6167 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
6168 } else {
6169 nochange_depth = 0;
6170 }
8e5e9ebe 6171 {
4aabdb9b 6172 /* execute the code in the {...} */
81ed78b2 6173
4aabdb9b 6174 dSP;
a6dc34f1 6175 IV before;
1f4d1a1e 6176 OP * const oop = PL_op;
4aabdb9b 6177 COP * const ocurcop = PL_curcop;
81ed78b2 6178 OP *nop;
81ed78b2 6179 CV *newcv;
91332126 6180
74088413 6181 /* save *all* paren positions */
92da3157 6182 regcppush(rex, 0, maxopenparen);
74088413
DM
6183 REGCP_SET(runops_cp);
6184
81ed78b2
DM
6185 if (!caller_cv)
6186 caller_cv = find_runcv(NULL);
6187
4aabdb9b 6188 n = ARG(scan);
81ed78b2 6189
b30fcab9 6190 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8d919b0a 6191 newcv = (ReANY(
b30fcab9
DM
6192 (REGEXP*)(rexi->data->data[n])
6193 ))->qr_anoncv
81ed78b2
DM
6194 ;
6195 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
6196 }
6197 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
6198 newcv = caller_cv;
6199 nop = (OP*)rexi->data->data[n];
6200 assert(CvDEPTH(newcv));
68e2671b
DM
6201 }
6202 else {
d24ca0c5
DM
6203 /* literal with own CV */
6204 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
6205 newcv = rex->qr_anoncv;
6206 nop = (OP*)rexi->data->data[n];
68e2671b 6207 }
81ed78b2 6208
0e458318
DM
6209 /* normally if we're about to execute code from the same
6210 * CV that we used previously, we just use the existing
6211 * CX stack entry. However, its possible that in the
6212 * meantime we may have backtracked, popped from the save
6213 * stack, and undone the SAVECOMPPAD(s) associated with
6214 * PUSH_MULTICALL; in which case PL_comppad no longer
6215 * points to newcv's pad. */
6216 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6217 {
b0065247
DM
6218 U8 flags = (CXp_SUB_RE |
6219 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
0e458318 6220 if (last_pushed_cv) {
b0065247 6221 CHANGE_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
6222 }
6223 else {
b0065247 6224 PUSH_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
6225 }
6226 last_pushed_cv = newcv;
6227 }
c31ee3bb
DM
6228 else {
6229 /* these assignments are just to silence compiler
6230 * warnings */
6231 multicall_cop = NULL;
6232 newsp = NULL;
6233 }
0e458318
DM
6234 last_pad = PL_comppad;
6235
2e2e3f36
DM
6236 /* the initial nextstate you would normally execute
6237 * at the start of an eval (which would cause error
6238 * messages to come from the eval), may be optimised
6239 * away from the execution path in the regex code blocks;
6240 * so manually set PL_curcop to it initially */
6241 {
81ed78b2 6242 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
6243 assert(o->op_type == OP_NULL);
6244 if (o->op_targ == OP_SCOPE) {
6245 o = cUNOPo->op_first;
6246 }
6247 else {
6248 assert(o->op_targ == OP_LEAVE);
6249 o = cUNOPo->op_first;
6250 assert(o->op_type == OP_ENTER);
e6dae479 6251 o = OpSIBLING(o);
2e2e3f36
DM
6252 }
6253
6254 if (o->op_type != OP_STUB) {
6255 assert( o->op_type == OP_NEXTSTATE
6256 || o->op_type == OP_DBSTATE
6257 || (o->op_type == OP_NULL
6258 && ( o->op_targ == OP_NEXTSTATE
6259 || o->op_targ == OP_DBSTATE
6260 )
6261 )
6262 );
6263 PL_curcop = (COP*)o;
6264 }
6265 }
81ed78b2 6266 nop = nop->op_next;
2e2e3f36 6267
24b23f37 6268 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
81ed78b2
DM
6269 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6270
8adc0f72 6271 rex->offs[0].end = locinput - reginfo->strbeg;
bf2039a9 6272 if (reginfo->info_aux_eval->pos_magic)
25fdce4a
FC
6273 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6274 reginfo->sv, reginfo->strbeg,
6275 locinput - reginfo->strbeg);
4aabdb9b 6276
2bf803e2
YO
6277 if (sv_yes_mark) {
6278 SV *sv_mrk = get_sv("REGMARK", 1);
6279 sv_setsv(sv_mrk, sv_yes_mark);
6280 }
6281
81ed78b2
DM
6282 /* we don't use MULTICALL here as we want to call the
6283 * first op of the block of interest, rather than the
6284 * first op of the sub */
a6dc34f1 6285 before = (IV)(SP-PL_stack_base);
81ed78b2 6286 PL_op = nop;
8e5e9ebe
RGS
6287 CALLRUNOPS(aTHX); /* Scalar context. */
6288 SPAGAIN;
a6dc34f1 6289 if ((IV)(SP-PL_stack_base) == before)
075aa684 6290 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
6291 else {
6292 ret = POPs;
6293 PUTBACK;
6294 }
4aabdb9b 6295
e4bfbed3
DM
6296 /* before restoring everything, evaluate the returned
6297 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
6298 * PL_op or pad. Also need to process any magic vars
6299 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
6300
6301 PL_op = NULL;
6302
5e98dac2 6303 re_sv = NULL;
e4bfbed3
DM
6304 if (logical == 0) /* (?{})/ */
6305 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6306 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
6307 sw = cBOOL(SvTRUE(ret));
6308 logical = 0;
6309 }
6310 else { /* /(??{}) */
497d0a96
DM
6311 /* if its overloaded, let the regex compiler handle
6312 * it; otherwise extract regex, or stringify */
237da807 6313 if (SvGMAGICAL(ret))
2685dc2d 6314 ret = sv_mortalcopy(ret);
497d0a96
DM
6315 if (!SvAMAGIC(ret)) {
6316 SV *sv = ret;
6317 if (SvROK(sv))
6318 sv = SvRV(sv);
6319 if (SvTYPE(sv) == SVt_REGEXP)
6320 re_sv = (REGEXP*) sv;
63620942
FC
6321 else if (SvSMAGICAL(ret)) {
6322 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
497d0a96
DM
6323 if (mg)
6324 re_sv = (REGEXP *) mg->mg_obj;
6325 }
e4bfbed3 6326
2685dc2d 6327 /* force any undef warnings here */
237da807
FC
6328 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6329 ret = sv_mortalcopy(ret);
497d0a96
DM
6330 (void) SvPV_force_nolen(ret);
6331 }
e4bfbed3
DM
6332 }
6333
6334 }
6335
81ed78b2
DM
6336 /* *** Note that at this point we don't restore
6337 * PL_comppad, (or pop the CxSUB) on the assumption it may
6338 * be used again soon. This is safe as long as nothing
6339 * in the regexp code uses the pad ! */
4aabdb9b 6340 PL_op = oop;
4aabdb9b 6341 PL_curcop = ocurcop;
92da3157 6342 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
f5df269c 6343 PL_curpm = PL_reg_curpm;
e4bfbed3
DM
6344
6345 if (logical != 2)
4aabdb9b 6346 break;
8e5e9ebe 6347 }
e4bfbed3
DM
6348
6349 /* only /(??{})/ from now on */
24d3c4a9 6350 logical = 0;
4aabdb9b 6351 {
4f639d21
DM
6352 /* extract RE object from returned value; compiling if
6353 * necessary */
5c35adbb 6354
575c37f6
DM
6355 if (re_sv) {
6356 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 6357 }
0f5d15d6 6358 else {
c737faaf 6359 U32 pm_flags = 0;
0f5d15d6 6360
9753d940
DM
6361 if (SvUTF8(ret) && IN_BYTES) {
6362 /* In use 'bytes': make a copy of the octet
6363 * sequence, but without the flag on */
b9ad30b4
NC
6364 STRLEN len;
6365 const char *const p = SvPV(ret, len);
6366 ret = newSVpvn_flags(p, len, SVs_TEMP);
6367 }
732caac7
DM
6368 if (rex->intflags & PREGf_USE_RE_EVAL)
6369 pm_flags |= PMf_USE_RE_EVAL;
6370
6371 /* if we got here, it should be an engine which
6372 * supports compiling code blocks and stuff */
6373 assert(rex->engine && rex->engine->op_comp);
ec841a27 6374 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 6375 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27 6376 rex->engine, NULL, NULL,
33be4c61 6377 /* copy /msixn etc to inner pattern */
13f27704 6378 ARG2L(scan),
ec841a27 6379 pm_flags);
732caac7 6380
9041c2e3 6381 if (!(SvFLAGS(ret)
237da807
FC
6382 & (SVs_TEMP | SVs_GMG | SVf_ROK))
6383 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
a2794585
NC
6384 /* This isn't a first class regexp. Instead, it's
6385 caching a regexp onto an existing, Perl visible
6386 scalar. */
575c37f6 6387 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 6388 }
0f5d15d6 6389 }
e1ff3a88 6390 SAVEFREESV(re_sv);
8d919b0a 6391 re = ReANY(re_sv);
4aabdb9b 6392 }
07bc277f 6393 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
6394 re->subbeg = rex->subbeg;
6395 re->sublen = rex->sublen;
6502e081
DM
6396 re->suboffset = rex->suboffset;
6397 re->subcoffset = rex->subcoffset;
d1b2014a
YO
6398 re->lastparen = 0;
6399 re->lastcloseparen = 0;
f8fc2ecf 6400 rei = RXi_GET(re);
6bda09f9 6401 DEBUG_EXECUTE_r(
220db18a
DM
6402 debug_start_match(re_sv, utf8_target, locinput,
6403 reginfo->strend, "Matching embedded");
6bda09f9 6404 );
f8fc2ecf 6405 startpoint = rei->program + 1;
1a147d38 6406 ST.close_paren = 0; /* only used for GOSUB */
d1b2014a
YO
6407 /* Save all the seen positions so far. */
6408 ST.cp = regcppush(rex, 0, maxopenparen);
6409 REGCP_SET(ST.lastcp);
6410 /* and set maxopenparen to 0, since we are starting a "fresh" match */
6411 maxopenparen = 0;
6412 /* run the pattern returned from (??{...}) */
aa283a38 6413
c52b8b12 6414 eval_recurse_doit: /* Share code with GOSUB below this line
d1b2014a
YO
6415 * At this point we expect the stack context to be
6416 * set up correctly */
4aabdb9b 6417
1cb95af7
DM
6418 /* invalidate the S-L poscache. We're now executing a
6419 * different set of WHILEM ops (and their associated
6420 * indexes) against the same string, so the bits in the
6421 * cache are meaningless. Setting maxiter to zero forces
6422 * the cache to be invalidated and zeroed before reuse.
6423 * XXX This is too dramatic a measure. Ideally we should
6424 * save the old cache and restore when running the outer
6425 * pattern again */
1cb48e53 6426 reginfo->poscache_maxiter = 0;
4aabdb9b 6427
d1b2014a 6428 /* the new regexp might have a different is_utf8_pat than we do */
aed7b151 6429 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
faec1544 6430
288b8c02 6431 ST.prev_rex = rex_sv;
faec1544 6432 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
6433 rex_sv = re_sv;
6434 SET_reg_curpm(rex_sv);
288b8c02 6435 rex = re;
f8fc2ecf 6436 rexi = rei;
faec1544 6437 cur_curlyx = NULL;
40a82448 6438 ST.B = next;
faec1544
DM
6439 ST.prev_eval = cur_eval;
6440 cur_eval = st;
faec1544 6441 /* now continue from first node in postoned RE */
4d5016e5 6442 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
a74ff37d 6443 /* NOTREACHED */
661d43c4 6444 NOT_REACHED; /* NOTREACHED */
c277df42 6445 }
40a82448 6446
faec1544
DM
6447 case EVAL_AB: /* cleanup after a successful (??{A})B */
6448 /* note: this is called twice; first after popping B, then A */
ec43f78b 6449 rex_sv = ST.prev_rex;
aed7b151 6450 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 6451 SET_reg_curpm(rex_sv);
8d919b0a 6452 rex = ReANY(rex_sv);
f8fc2ecf 6453 rexi = RXi_GET(rex);
4b22688e
YO
6454 {
6455 /* preserve $^R across LEAVE's. See Bug 121070. */
6456 SV *save_sv= GvSV(PL_replgv);
6457 SvREFCNT_inc(save_sv);
6458 regcpblow(ST.cp); /* LEAVE in disguise */
6459 sv_setsv(GvSV(PL_replgv), save_sv);
6460 SvREFCNT_dec(save_sv);
6461 }
faec1544
DM
6462 cur_eval = ST.prev_eval;
6463 cur_curlyx = ST.prev_curlyx;
34a81e2b 6464
1cb95af7 6465 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 6466 reginfo->poscache_maxiter = 0;
e7707071 6467 if ( nochange_depth )
4b196cd4 6468 nochange_depth--;
262b90c4 6469 sayYES;
40a82448 6470
40a82448 6471
faec1544
DM
6472 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
6473 /* note: this is called twice; first after popping B, then A */
ec43f78b 6474 rex_sv = ST.prev_rex;
aed7b151 6475 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 6476 SET_reg_curpm(rex_sv);
8d919b0a 6477 rex = ReANY(rex_sv);
f8fc2ecf 6478 rexi = RXi_GET(rex);
0357f1fd 6479
40a82448 6480 REGCP_UNWIND(ST.lastcp);
92da3157 6481 regcppop(rex, &maxopenparen);
faec1544
DM
6482 cur_eval = ST.prev_eval;
6483 cur_curlyx = ST.prev_curlyx;
1cb95af7 6484 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 6485 reginfo->poscache_maxiter = 0;
e7707071 6486 if ( nochange_depth )
4b196cd4 6487 nochange_depth--;
40a82448 6488 sayNO_SILENT;
40a82448
DM
6489#undef ST
6490
3c0563b9 6491 case OPEN: /* ( */
c277df42 6492 n = ARG(scan); /* which paren pair */
9d9163fb 6493 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
92da3157
DM
6494 if (n > maxopenparen)
6495 maxopenparen = n;
495f47a5 6496 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
92da3157 6497 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
495f47a5
DM
6498 PTR2UV(rex),
6499 PTR2UV(rex->offs),
6500 (UV)n,
6501 (IV)rex->offs[n].start_tmp,
92da3157 6502 (UV)maxopenparen
495f47a5 6503 ));
e2e6a0f1 6504 lastopen = n;
a0d0e21e 6505 break;
495f47a5
DM
6506
6507/* XXX really need to log other places start/end are set too */
6508#define CLOSE_CAPTURE \
6509 rex->offs[n].start = rex->offs[n].start_tmp; \
9d9163fb 6510 rex->offs[n].end = locinput - reginfo->strbeg; \
495f47a5
DM
6511 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
6512 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
6513 PTR2UV(rex), \
6514 PTR2UV(rex->offs), \
6515 (UV)n, \
6516 (IV)rex->offs[n].start, \
6517 (IV)rex->offs[n].end \
6518 ))
6519
3c0563b9 6520 case CLOSE: /* ) */
c277df42 6521 n = ARG(scan); /* which paren pair */
495f47a5 6522 CLOSE_CAPTURE;
b93070ed
DM
6523 if (n > rex->lastparen)
6524 rex->lastparen = n;
6525 rex->lastcloseparen = n;
3b6647e0 6526 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6bda09f9
YO
6527 goto fake_end;
6528 }
a0d0e21e 6529 break;
3c0563b9
DM
6530
6531 case ACCEPT: /* (*ACCEPT) */
e2e6a0f1
YO
6532 if (ARG(scan)){
6533 regnode *cursor;
6534 for (cursor=scan;
6535 cursor && OP(cursor)!=END;
6536 cursor=regnext(cursor))
6537 {
6538 if ( OP(cursor)==CLOSE ){
6539 n = ARG(cursor);
6540 if ( n <= lastopen ) {
495f47a5 6541 CLOSE_CAPTURE;
b93070ed
DM
6542 if (n > rex->lastparen)
6543 rex->lastparen = n;
6544 rex->lastcloseparen = n;
3b6647e0
RB
6545 if ( n == ARG(scan) || (cur_eval &&
6546 cur_eval->u.eval.close_paren == n))
e2e6a0f1
YO
6547 break;
6548 }
6549 }
6550 }
6551 }
6552 goto fake_end;
a74ff37d 6553 /* NOTREACHED */
3c0563b9
DM
6554
6555 case GROUPP: /* (?(1)) */
c277df42 6556 n = ARG(scan); /* which paren pair */
b93070ed 6557 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 6558 break;
3c0563b9
DM
6559
6560 case NGROUPP: /* (?(<name>)) */
0a4db386 6561 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 6562 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 6563 break;
3c0563b9
DM
6564
6565 case INSUBP: /* (?(R)) */
0a4db386 6566 n = ARG(scan);
3b6647e0 6567 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
0a4db386 6568 break;
3c0563b9
DM
6569
6570 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
6571 sw = 0;
6572 break;
3c0563b9
DM
6573
6574 case IFTHEN: /* (?(cond)A|B) */
1cb48e53 6575 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
24d3c4a9 6576 if (sw)
c277df42
IZ
6577 next = NEXTOPER(NEXTOPER(scan));
6578 else {
6579 next = scan + ARG(scan);
6580 if (OP(next) == IFTHEN) /* Fake one. */
6581 next = NEXTOPER(NEXTOPER(next));
6582 }
6583 break;
3c0563b9
DM
6584
6585 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 6586 logical = scan->flags;
c277df42 6587 break;
c476f425 6588
2ab05381 6589/*******************************************************************
2ab05381 6590
c476f425
DM
6591The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
6592pattern, where A and B are subpatterns. (For simple A, CURLYM or
6593STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 6594
c476f425 6595A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 6596
c476f425
DM
6597On entry to the subpattern, CURLYX is called. This pushes a CURLYX
6598state, which contains the current count, initialised to -1. It also sets
6599cur_curlyx to point to this state, with any previous value saved in the
6600state block.
2ab05381 6601
c476f425
DM
6602CURLYX then jumps straight to the WHILEM op, rather than executing A,
6603since the pattern may possibly match zero times (i.e. it's a while {} loop
6604rather than a do {} while loop).
2ab05381 6605
c476f425
DM
6606Each entry to WHILEM represents a successful match of A. The count in the
6607CURLYX block is incremented, another WHILEM state is pushed, and execution
6608passes to A or B depending on greediness and the current count.
2ab05381 6609
c476f425
DM
6610For example, if matching against the string a1a2a3b (where the aN are
6611substrings that match /A/), then the match progresses as follows: (the
6612pushed states are interspersed with the bits of strings matched so far):
2ab05381 6613
c476f425
DM
6614 <CURLYX cnt=-1>
6615 <CURLYX cnt=0><WHILEM>
6616 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
6617 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
6618 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
6619 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 6620
c476f425
DM
6621(Contrast this with something like CURLYM, which maintains only a single
6622backtrack state:
2ab05381 6623
c476f425
DM
6624 <CURLYM cnt=0> a1
6625 a1 <CURLYM cnt=1> a2
6626 a1 a2 <CURLYM cnt=2> a3
6627 a1 a2 a3 <CURLYM cnt=3> b
6628)
2ab05381 6629
c476f425
DM
6630Each WHILEM state block marks a point to backtrack to upon partial failure
6631of A or B, and also contains some minor state data related to that
6632iteration. The CURLYX block, pointed to by cur_curlyx, contains the
6633overall state, such as the count, and pointers to the A and B ops.
2ab05381 6634
c476f425
DM
6635This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
6636must always point to the *current* CURLYX block, the rules are:
2ab05381 6637
c476f425
DM
6638When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
6639and set cur_curlyx to point the new block.
2ab05381 6640
c476f425
DM
6641When popping the CURLYX block after a successful or unsuccessful match,
6642restore the previous cur_curlyx.
2ab05381 6643
c476f425
DM
6644When WHILEM is about to execute B, save the current cur_curlyx, and set it
6645to the outer one saved in the CURLYX block.
2ab05381 6646
c476f425
DM
6647When popping the WHILEM block after a successful or unsuccessful B match,
6648restore the previous cur_curlyx.
2ab05381 6649
c476f425
DM
6650Here's an example for the pattern (AI* BI)*BO
6651I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 6652
c476f425
DM
6653cur_
6654curlyx backtrack stack
6655------ ---------------
6656NULL
6657CO <CO prev=NULL> <WO>
6658CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6659CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6660NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 6661
c476f425
DM
6662At this point the pattern succeeds, and we work back down the stack to
6663clean up, restoring as we go:
95b24440 6664
c476f425
DM
6665CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6666CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6667CO <CO prev=NULL> <WO>
6668NULL
a0374537 6669
c476f425
DM
6670*******************************************************************/
6671
6672#define ST st->u.curlyx
6673
6674 case CURLYX: /* start of /A*B/ (for complex A) */
6675 {
6676 /* No need to save/restore up to this paren */
6677 I32 parenfloor = scan->flags;
6678
6679 assert(next); /* keep Coverity happy */
6680 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
6681 next += ARG(next);
6682
6683 /* XXXX Probably it is better to teach regpush to support
92da3157 6684 parenfloor > maxopenparen ... */
b93070ed
DM
6685 if (parenfloor > (I32)rex->lastparen)
6686 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
6687
6688 ST.prev_curlyx= cur_curlyx;
6689 cur_curlyx = st;
6690 ST.cp = PL_savestack_ix;
6691
6692 /* these fields contain the state of the current curly.
6693 * they are accessed by subsequent WHILEMs */
6694 ST.parenfloor = parenfloor;
d02d6d97 6695 ST.me = scan;
c476f425 6696 ST.B = next;
24d3c4a9
DM
6697 ST.minmod = minmod;
6698 minmod = 0;
c476f425
DM
6699 ST.count = -1; /* this will be updated by WHILEM */
6700 ST.lastloc = NULL; /* this will be updated by WHILEM */
6701
4d5016e5 6702 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
a74ff37d 6703 /* NOTREACHED */
661d43c4 6704 NOT_REACHED; /* NOTREACHED */
c476f425 6705 }
a0d0e21e 6706
c476f425 6707 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
6708 cur_curlyx = ST.prev_curlyx;
6709 sayYES;
a74ff37d 6710 /* NOTREACHED */
661d43c4 6711 NOT_REACHED; /* NOTREACHED */
a0d0e21e 6712
c476f425
DM
6713 case CURLYX_end_fail: /* just failed to match all of A*B */
6714 regcpblow(ST.cp);
6715 cur_curlyx = ST.prev_curlyx;
6716 sayNO;
a74ff37d 6717 /* NOTREACHED */
661d43c4 6718 NOT_REACHED; /* NOTREACHED */
4633a7c4 6719
a0d0e21e 6720
c476f425
DM
6721#undef ST
6722#define ST st->u.whilem
6723
6724 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
6725 {
6726 /* see the discussion above about CURLYX/WHILEM */
c476f425 6727 I32 n;
1a522d5d
JH
6728 int min, max;
6729 regnode *A;
d02d6d97 6730
c476f425 6731 assert(cur_curlyx); /* keep Coverity happy */
1a522d5d
JH
6732
6733 min = ARG1(cur_curlyx->u.curlyx.me);
6734 max = ARG2(cur_curlyx->u.curlyx.me);
6735 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
c476f425
DM
6736 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
6737 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
6738 ST.cache_offset = 0;
6739 ST.cache_mask = 0;
6740
c476f425
DM
6741
6742 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
d02d6d97
DM
6743 "%*s whilem: matched %ld out of %d..%d\n",
6744 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
c476f425 6745 );
a0d0e21e 6746
c476f425 6747 /* First just match a string of min A's. */
a0d0e21e 6748
d02d6d97 6749 if (n < min) {
92da3157
DM
6750 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6751 maxopenparen);
c476f425 6752 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
6753 REGCP_SET(ST.lastcp);
6754
4d5016e5 6755 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
a74ff37d 6756 /* NOTREACHED */
661d43c4 6757 NOT_REACHED; /* NOTREACHED */
c476f425
DM
6758 }
6759
6760 /* If degenerate A matches "", assume A done. */
6761
6762 if (locinput == cur_curlyx->u.curlyx.lastloc) {
6763 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6764 "%*s whilem: empty match detected, trying continuation...\n",
6765 REPORT_CODE_OFF+depth*2, "")
6766 );
6767 goto do_whilem_B_max;
6768 }
6769
1cb95af7
DM
6770 /* super-linear cache processing.
6771 *
6772 * The idea here is that for certain types of CURLYX/WHILEM -
6773 * principally those whose upper bound is infinity (and
6774 * excluding regexes that have things like \1 and other very
6775 * non-regular expresssiony things), then if a pattern like
6776 * /....A*.../ fails and we backtrack to the WHILEM, then we
6777 * make a note that this particular WHILEM op was at string
6778 * position 47 (say) when the rest of pattern failed. Then, if
6779 * we ever find ourselves back at that WHILEM, and at string
6780 * position 47 again, we can just fail immediately rather than
6781 * running the rest of the pattern again.
6782 *
6783 * This is very handy when patterns start to go
6784 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
6785 * with a combinatorial explosion of backtracking.
6786 *
6787 * The cache is implemented as a bit array, with one bit per
6788 * string byte position per WHILEM op (up to 16) - so its
6789 * between 0.25 and 2x the string size.
6790 *
6791 * To avoid allocating a poscache buffer every time, we do an
6792 * initially countdown; only after we have executed a WHILEM
6793 * op (string-length x #WHILEMs) times do we allocate the
6794 * cache.
6795 *
6796 * The top 4 bits of scan->flags byte say how many different
6797 * relevant CURLLYX/WHILEM op pairs there are, while the
6798 * bottom 4-bits is the identifying index number of this
6799 * WHILEM.
6800 */
c476f425
DM
6801
6802 if (scan->flags) {
a0d0e21e 6803
1cb48e53 6804 if (!reginfo->poscache_maxiter) {
c476f425
DM
6805 /* start the countdown: Postpone detection until we
6806 * know the match is not *that* much linear. */
1cb48e53 6807 reginfo->poscache_maxiter
9d9163fb
DM
6808 = (reginfo->strend - reginfo->strbeg + 1)
6809 * (scan->flags>>4);
66bf836d 6810 /* possible overflow for long strings and many CURLYX's */
1cb48e53
DM
6811 if (reginfo->poscache_maxiter < 0)
6812 reginfo->poscache_maxiter = I32_MAX;
6813 reginfo->poscache_iter = reginfo->poscache_maxiter;
2c2d71f5 6814 }
c476f425 6815
1cb48e53 6816 if (reginfo->poscache_iter-- == 0) {
c476f425 6817 /* initialise cache */
ea3daa5d 6818 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
2ac8ff4b
DM
6819 regmatch_info_aux *const aux = reginfo->info_aux;
6820 if (aux->poscache) {
ea3daa5d 6821 if ((SSize_t)reginfo->poscache_size < size) {
2ac8ff4b
DM
6822 Renew(aux->poscache, size, char);
6823 reginfo->poscache_size = size;
2c2d71f5 6824 }
2ac8ff4b 6825 Zero(aux->poscache, size, char);
2c2d71f5
JH
6826 }
6827 else {
2ac8ff4b
DM
6828 reginfo->poscache_size = size;
6829 Newxz(aux->poscache, size, char);
2c2d71f5 6830 }
c476f425
DM
6831 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6832 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
6833 PL_colors[4], PL_colors[5])
6834 );
2c2d71f5 6835 }
c476f425 6836
1cb48e53 6837 if (reginfo->poscache_iter < 0) {
c476f425 6838 /* have we already failed at this position? */
ea3daa5d 6839 SSize_t offset, mask;
338e600a
DM
6840
6841 reginfo->poscache_iter = -1; /* stop eventual underflow */
c476f425 6842 offset = (scan->flags & 0xf) - 1
9d9163fb
DM
6843 + (locinput - reginfo->strbeg)
6844 * (scan->flags>>4);
c476f425
DM
6845 mask = 1 << (offset % 8);
6846 offset /= 8;
2ac8ff4b 6847 if (reginfo->info_aux->poscache[offset] & mask) {
c476f425
DM
6848 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6849 "%*s whilem: (cache) already tried at this position...\n",
6850 REPORT_CODE_OFF+depth*2, "")
2c2d71f5 6851 );
3298f257 6852 sayNO; /* cache records failure */
2c2d71f5 6853 }
c476f425
DM
6854 ST.cache_offset = offset;
6855 ST.cache_mask = mask;
2c2d71f5 6856 }
c476f425 6857 }
2c2d71f5 6858
c476f425 6859 /* Prefer B over A for minimal matching. */
a687059c 6860
c476f425
DM
6861 if (cur_curlyx->u.curlyx.minmod) {
6862 ST.save_curlyx = cur_curlyx;
6863 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
92da3157
DM
6864 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
6865 maxopenparen);
c476f425 6866 REGCP_SET(ST.lastcp);
4d5016e5
DM
6867 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
6868 locinput);
a74ff37d 6869 /* NOTREACHED */
661d43c4 6870 NOT_REACHED; /* NOTREACHED */
c476f425 6871 }
a0d0e21e 6872
c476f425
DM
6873 /* Prefer A over B for maximal matching. */
6874
d02d6d97 6875 if (n < max) { /* More greed allowed? */
92da3157
DM
6876 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6877 maxopenparen);
c476f425
DM
6878 cur_curlyx->u.curlyx.lastloc = locinput;
6879 REGCP_SET(ST.lastcp);
4d5016e5 6880 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
a74ff37d 6881 /* NOTREACHED */
661d43c4 6882 NOT_REACHED; /* NOTREACHED */
c476f425
DM
6883 }
6884 goto do_whilem_B_max;
6885 }
a74ff37d 6886 /* NOTREACHED */
661d43c4 6887 NOT_REACHED; /* NOTREACHED */
c476f425
DM
6888
6889 case WHILEM_B_min: /* just matched B in a minimal match */
6890 case WHILEM_B_max: /* just matched B in a maximal match */
6891 cur_curlyx = ST.save_curlyx;
6892 sayYES;
a74ff37d 6893 /* NOTREACHED */
661d43c4 6894 NOT_REACHED; /* NOTREACHED */
c476f425
DM
6895
6896 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
6897 cur_curlyx = ST.save_curlyx;
6898 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6899 cur_curlyx->u.curlyx.count--;
6900 CACHEsayNO;
a74ff37d 6901 /* NOTREACHED */
661d43c4 6902 NOT_REACHED; /* NOTREACHED */
c476f425
DM
6903
6904 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
924ba076 6905 /* FALLTHROUGH */
c476f425 6906 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa 6907 REGCP_UNWIND(ST.lastcp);
92da3157 6908 regcppop(rex, &maxopenparen);
c476f425
DM
6909 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6910 cur_curlyx->u.curlyx.count--;
6911 CACHEsayNO;
a74ff37d 6912 /* NOTREACHED */
661d43c4 6913 NOT_REACHED; /* NOTREACHED */
c476f425
DM
6914
6915 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
6916 REGCP_UNWIND(ST.lastcp);
92da3157 6917 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
c476f425
DM
6918 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6919 "%*s whilem: failed, trying continuation...\n",
6920 REPORT_CODE_OFF+depth*2, "")
6921 );
6922 do_whilem_B_max:
6923 if (cur_curlyx->u.curlyx.count >= REG_INFTY
6924 && ckWARN(WARN_REGEXP)
39819bd9 6925 && !reginfo->warned)
c476f425 6926 {
39819bd9 6927 reginfo->warned = TRUE;
dcbac5bb
FC
6928 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6929 "Complex regular subexpression recursion limit (%d) "
6930 "exceeded",
c476f425
DM
6931 REG_INFTY - 1);
6932 }
6933
6934 /* now try B */
6935 ST.save_curlyx = cur_curlyx;
6936 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
6937 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
6938 locinput);
a74ff37d 6939 /* NOTREACHED */
661d43c4 6940 NOT_REACHED; /* NOTREACHED */
c476f425
DM
6941
6942 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6943 cur_curlyx = ST.save_curlyx;
6944 REGCP_UNWIND(ST.lastcp);
92da3157 6945 regcppop(rex, &maxopenparen);
c476f425 6946
d02d6d97 6947 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
6948 /* Maximum greed exceeded */
6949 if (cur_curlyx->u.curlyx.count >= REG_INFTY
6950 && ckWARN(WARN_REGEXP)
39819bd9 6951 && !reginfo->warned)
c476f425 6952 {
39819bd9 6953 reginfo->warned = TRUE;
c476f425 6954 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
6955 "Complex regular subexpression recursion "
6956 "limit (%d) exceeded",
c476f425 6957 REG_INFTY - 1);
a0d0e21e 6958 }
c476f425 6959 cur_curlyx->u.curlyx.count--;
3ab3c9b4 6960 CACHEsayNO;
a0d0e21e 6961 }
c476f425
DM
6962
6963 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6964 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6965 );
6966 /* Try grabbing another A and see if it helps. */
c476f425 6967 cur_curlyx->u.curlyx.lastloc = locinput;
92da3157
DM
6968 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6969 maxopenparen);
c476f425 6970 REGCP_SET(ST.lastcp);
d02d6d97 6971 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
6972 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6973 locinput);
a74ff37d 6974 /* NOTREACHED */
661d43c4 6975 NOT_REACHED; /* NOTREACHED */
40a82448
DM
6976
6977#undef ST
6978#define ST st->u.branch
6979
6980 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
6981 next = scan + ARG(scan);
6982 if (next == scan)
6983 next = NULL;
40a82448 6984 scan = NEXTOPER(scan);
924ba076 6985 /* FALLTHROUGH */
c277df42 6986
40a82448
DM
6987 case BRANCH: /* /(...|A|...)/ */
6988 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 6989 ST.lastparen = rex->lastparen;
f6033a9d 6990 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
6991 ST.next_branch = next;
6992 REGCP_SET(ST.cp);
02db2b7b 6993
40a82448 6994 /* Now go into the branch */
5d458dd8 6995 if (has_cutgroup) {
4d5016e5 6996 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 6997 } else {
4d5016e5 6998 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 6999 }
a74ff37d 7000 /* NOTREACHED */
661d43c4 7001 NOT_REACHED; /* NOTREACHED */
3c0563b9
DM
7002
7003 case CUTGROUP: /* /(*THEN)/ */
5d458dd8 7004 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
ad64d0ec 7005 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 7006 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
a74ff37d 7007 /* NOTREACHED */
661d43c4 7008 NOT_REACHED; /* NOTREACHED */
3c0563b9 7009
5d458dd8
YO
7010 case CUTGROUP_next_fail:
7011 do_cutgroup = 1;
7012 no_final = 1;
7013 if (st->u.mark.mark_name)
7014 sv_commit = st->u.mark.mark_name;
7015 sayNO;
a74ff37d 7016 /* NOTREACHED */
661d43c4 7017 NOT_REACHED; /* NOTREACHED */
3c0563b9 7018
5d458dd8
YO
7019 case BRANCH_next:
7020 sayYES;
a74ff37d 7021 /* NOTREACHED */
661d43c4 7022 NOT_REACHED; /* NOTREACHED */
3c0563b9 7023
40a82448 7024 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
7025 if (do_cutgroup) {
7026 do_cutgroup = 0;
7027 no_final = 0;
7028 }
40a82448 7029 REGCP_UNWIND(ST.cp);
a8d1f4b4 7030 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
7031 scan = ST.next_branch;
7032 /* no more branches? */
5d458dd8
YO
7033 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7034 DEBUG_EXECUTE_r({
7035 PerlIO_printf( Perl_debug_log,
7036 "%*s %sBRANCH failed...%s\n",
7037 REPORT_CODE_OFF+depth*2, "",
7038 PL_colors[4],
7039 PL_colors[5] );
7040 });
7041 sayNO_SILENT;
7042 }
40a82448 7043 continue; /* execute next BRANCH[J] op */
a74ff37d 7044 /* NOTREACHED */
40a82448 7045
3c0563b9 7046 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 7047 minmod = 1;
a0d0e21e 7048 break;
40a82448
DM
7049
7050#undef ST
7051#define ST st->u.curlym
7052
7053 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
7054
7055 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 7056 * only a single backtracking state, no matter how many matches
40a82448
DM
7057 * there are in {m,n}. It relies on the pattern being constant
7058 * length, with no parens to influence future backrefs
7059 */
7060
7061 ST.me = scan;
dc45a647 7062 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 7063
f6033a9d
DM
7064 ST.lastparen = rex->lastparen;
7065 ST.lastcloseparen = rex->lastcloseparen;
7066
40a82448
DM
7067 /* if paren positive, emulate an OPEN/CLOSE around A */
7068 if (ST.me->flags) {
3b6647e0 7069 U32 paren = ST.me->flags;
92da3157
DM
7070 if (paren > maxopenparen)
7071 maxopenparen = paren;
c277df42 7072 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 7073 }
40a82448
DM
7074 ST.A = scan;
7075 ST.B = next;
7076 ST.alen = 0;
7077 ST.count = 0;
24d3c4a9
DM
7078 ST.minmod = minmod;
7079 minmod = 0;
40a82448
DM
7080 ST.c1 = CHRTEST_UNINIT;
7081 REGCP_SET(ST.cp);
6407bf3b 7082
40a82448
DM
7083 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7084 goto curlym_do_B;
7085
7086 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 7087 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
a74ff37d 7088 /* NOTREACHED */
661d43c4 7089 NOT_REACHED; /* NOTREACHED */
5f80c4cf 7090
40a82448 7091 case CURLYM_A: /* we've just matched an A */
40a82448
DM
7092 ST.count++;
7093 /* after first match, determine A's length: u.curlym.alen */
7094 if (ST.count == 1) {
ba44c216 7095 if (reginfo->is_utf8_target) {
c07e9d7b
DM
7096 char *s = st->locinput;
7097 while (s < locinput) {
40a82448
DM
7098 ST.alen++;
7099 s += UTF8SKIP(s);
7100 }
7101 }
7102 else {
c07e9d7b 7103 ST.alen = locinput - st->locinput;
40a82448
DM
7104 }
7105 if (ST.alen == 0)
7106 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7107 }
0cadcf80
DM
7108 DEBUG_EXECUTE_r(
7109 PerlIO_printf(Perl_debug_log,
40a82448 7110 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5bc10b2c 7111 (int)(REPORT_CODE_OFF+(depth*2)), "",
40a82448 7112 (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
7113 );
7114
0a4db386 7115 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 7116 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
7117 goto fake_end;
7118
c966426a
DM
7119 {
7120 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7121 if ( max == REG_INFTY || ST.count < max )
7122 goto curlym_do_A; /* try to match another A */
7123 }
40a82448 7124 goto curlym_do_B; /* try to match B */
5f80c4cf 7125
40a82448
DM
7126 case CURLYM_A_fail: /* just failed to match an A */
7127 REGCP_UNWIND(ST.cp);
0a4db386
YO
7128
7129 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7130 || (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 7131 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
40a82448 7132 sayNO;
0cadcf80 7133
40a82448 7134 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
7135 if (ST.c1 == CHRTEST_UNINIT) {
7136 /* calculate c1 and c2 for possible match of 1st char
7137 * following curly */
7138 ST.c1 = ST.c2 = CHRTEST_VOID;
d20a21f4 7139 assert(ST.B);
40a82448
DM
7140 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7141 regnode *text_node = ST.B;
7142 if (! HAS_TEXT(text_node))
7143 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
7144 /* this used to be
7145
7146 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7147
7148 But the former is redundant in light of the latter.
7149
7150 if this changes back then the macro for
7151 IS_TEXT and friends need to change.
7152 */
c74f6de9 7153 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8 7154 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 7155 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 7156 reginfo))
c74f6de9
KW
7157 {
7158 sayNO;
7159 }
c277df42 7160 }
c277df42 7161 }
40a82448
DM
7162 }
7163
7164 DEBUG_EXECUTE_r(
7165 PerlIO_printf(Perl_debug_log,
7166 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5bc10b2c 7167 (int)(REPORT_CODE_OFF+(depth*2)),
40a82448
DM
7168 "", (IV)ST.count)
7169 );
c74f6de9 7170 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
7171 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7172 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7173 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7174 {
7175 /* simulate B failing */
7176 DEBUG_OPTIMISE_r(
7177 PerlIO_printf(Perl_debug_log,
33daa3a5 7178 "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
79a2a0e8
KW
7179 (int)(REPORT_CODE_OFF+(depth*2)),"",
7180 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7181 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7182 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7183 );
7184 state_num = CURLYM_B_fail;
7185 goto reenter_switch;
7186 }
7187 }
7188 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
7189 /* simulate B failing */
7190 DEBUG_OPTIMISE_r(
7191 PerlIO_printf(Perl_debug_log,
33daa3a5 7192 "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
5400f398 7193 (int)(REPORT_CODE_OFF+(depth*2)),"",
79a2a0e8
KW
7194 (int) nextchr, ST.c1, ST.c2)
7195 );
5400f398
KW
7196 state_num = CURLYM_B_fail;
7197 goto reenter_switch;
7198 }
c74f6de9 7199 }
40a82448
DM
7200
7201 if (ST.me->flags) {
f6033a9d 7202 /* emulate CLOSE: mark current A as captured */
40a82448
DM
7203 I32 paren = ST.me->flags;
7204 if (ST.count) {
b93070ed 7205 rex->offs[paren].start
9d9163fb
DM
7206 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7207 rex->offs[paren].end = locinput - reginfo->strbeg;
f6033a9d
DM
7208 if ((U32)paren > rex->lastparen)
7209 rex->lastparen = paren;
7210 rex->lastcloseparen = paren;
c277df42 7211 }
40a82448 7212 else
b93070ed 7213 rex->offs[paren].end = -1;
0a4db386 7214 if (cur_eval && cur_eval->u.eval.close_paren &&
24b23f37 7215 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
0a4db386
YO
7216 {
7217 if (ST.count)
7218 goto fake_end;
7219 else
7220 sayNO;
7221 }
c277df42 7222 }
0a4db386 7223
4d5016e5 7224 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
a74ff37d 7225 /* NOTREACHED */
661d43c4 7226 NOT_REACHED; /* NOTREACHED */
40a82448
DM
7227
7228 case CURLYM_B_fail: /* just failed to match a B */
7229 REGCP_UNWIND(ST.cp);
a8d1f4b4 7230 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 7231 if (ST.minmod) {
84d2fa14
HS
7232 I32 max = ARG2(ST.me);
7233 if (max != REG_INFTY && ST.count == max)
40a82448
DM
7234 sayNO;
7235 goto curlym_do_A; /* try to match a further A */
7236 }
7237 /* backtrack one A */
7238 if (ST.count == ARG1(ST.me) /* min */)
7239 sayNO;
7240 ST.count--;
7016d6eb 7241 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
7242 goto curlym_do_B; /* try to match B */
7243
c255a977
DM
7244#undef ST
7245#define ST st->u.curly
40a82448 7246
c255a977
DM
7247#define CURLY_SETPAREN(paren, success) \
7248 if (paren) { \
7249 if (success) { \
9d9163fb
DM
7250 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7251 rex->offs[paren].end = locinput - reginfo->strbeg; \
f6033a9d
DM
7252 if (paren > rex->lastparen) \
7253 rex->lastparen = paren; \
b93070ed 7254 rex->lastcloseparen = paren; \
c255a977 7255 } \
f6033a9d 7256 else { \
b93070ed 7257 rex->offs[paren].end = -1; \
f6033a9d
DM
7258 rex->lastparen = ST.lastparen; \
7259 rex->lastcloseparen = ST.lastcloseparen; \
7260 } \
c255a977
DM
7261 }
7262
b40a2c17 7263 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
7264 ST.paren = 0;
7265 ST.min = 0;
7266 ST.max = REG_INFTY;
a0d0e21e
LW
7267 scan = NEXTOPER(scan);
7268 goto repeat;
3c0563b9 7269
b40a2c17 7270 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
7271 ST.paren = 0;
7272 ST.min = 1;
7273 ST.max = REG_INFTY;
c277df42 7274 scan = NEXTOPER(scan);
c255a977 7275 goto repeat;
3c0563b9 7276
b40a2c17 7277 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
7278 ST.paren = scan->flags; /* Which paren to set */
7279 ST.lastparen = rex->lastparen;
f6033a9d 7280 ST.lastcloseparen = rex->lastcloseparen;
92da3157
DM
7281 if (ST.paren > maxopenparen)
7282 maxopenparen = ST.paren;
c255a977
DM
7283 ST.min = ARG1(scan); /* min to match */
7284 ST.max = ARG2(scan); /* max to match */
0a4db386 7285 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 7286 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
7287 ST.min=1;
7288 ST.max=1;
7289 }
c255a977
DM
7290 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7291 goto repeat;
3c0563b9 7292
b40a2c17 7293 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
7294 ST.paren = 0;
7295 ST.min = ARG1(scan); /* min to match */
7296 ST.max = ARG2(scan); /* max to match */
7297 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 7298 repeat:
a0d0e21e
LW
7299 /*
7300 * Lookahead to avoid useless match attempts
7301 * when we know what character comes next.
c255a977 7302 *
5f80c4cf
JP
7303 * Used to only do .*x and .*?x, but now it allows
7304 * for )'s, ('s and (?{ ... })'s to be in the way
7305 * of the quantifier and the EXACT-like node. -- japhy
7306 */
7307
eb5c1be8 7308 assert(ST.min <= ST.max);
3337dfe3
KW
7309 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7310 ST.c1 = ST.c2 = CHRTEST_VOID;
7311 }
7312 else {
5f80c4cf
JP
7313 regnode *text_node = next;
7314
3dab1dad
YO
7315 if (! HAS_TEXT(text_node))
7316 FIND_NEXT_IMPT(text_node);
5f80c4cf 7317
9e137952 7318 if (! HAS_TEXT(text_node))
c255a977 7319 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 7320 else {
ee9b8eae 7321 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 7322 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 7323 }
c74f6de9 7324 else {
ee9b8eae
YO
7325
7326 /* Currently we only get here when
7327
7328 PL_rekind[OP(text_node)] == EXACT
7329
7330 if this changes back then the macro for IS_TEXT and
7331 friends need to change. */
79a2a0e8 7332 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 7333 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 7334 reginfo))
c74f6de9
KW
7335 {
7336 sayNO;
7337 }
7338 }
1aa99e6b 7339 }
bbce6d69 7340 }
c255a977
DM
7341
7342 ST.A = scan;
7343 ST.B = next;
24d3c4a9 7344 if (minmod) {
eb72505d 7345 char *li = locinput;
24d3c4a9 7346 minmod = 0;
984e6dd1 7347 if (ST.min &&
f9176b44 7348 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
984e6dd1 7349 < ST.min)
4633a7c4 7350 sayNO;
7016d6eb 7351 SET_locinput(li);
c255a977 7352 ST.count = ST.min;
c255a977
DM
7353 REGCP_SET(ST.cp);
7354 if (ST.c1 == CHRTEST_VOID)
7355 goto curly_try_B_min;
7356
7357 ST.oldloc = locinput;
7358
7359 /* set ST.maxpos to the furthest point along the
7360 * string that could possibly match */
7361 if (ST.max == REG_INFTY) {
220db18a 7362 ST.maxpos = reginfo->strend - 1;
f2ed9b32 7363 if (utf8_target)
c255a977
DM
7364 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7365 ST.maxpos--;
7366 }
f2ed9b32 7367 else if (utf8_target) {
c255a977
DM
7368 int m = ST.max - ST.min;
7369 for (ST.maxpos = locinput;
220db18a 7370 m >0 && ST.maxpos < reginfo->strend; m--)
c255a977
DM
7371 ST.maxpos += UTF8SKIP(ST.maxpos);
7372 }
7373 else {
7374 ST.maxpos = locinput + ST.max - ST.min;
220db18a
DM
7375 if (ST.maxpos >= reginfo->strend)
7376 ST.maxpos = reginfo->strend - 1;
c255a977
DM
7377 }
7378 goto curly_try_B_min_known;
7379
7380 }
7381 else {
eb72505d
DM
7382 /* avoid taking address of locinput, so it can remain
7383 * a register var */
7384 char *li = locinput;
f9176b44 7385 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
c255a977
DM
7386 if (ST.count < ST.min)
7387 sayNO;
7016d6eb 7388 SET_locinput(li);
c255a977
DM
7389 if ((ST.count > ST.min)
7390 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
7391 {
7392 /* A{m,n} must come at the end of the string, there's
7393 * no point in backing off ... */
7394 ST.min = ST.count;
7395 /* ...except that $ and \Z can match before *and* after
7396 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
7397 We may back off by one in this case. */
eb72505d 7398 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
7399 ST.min--;
7400 }
7401 REGCP_SET(ST.cp);
7402 goto curly_try_B_max;
7403 }
a74ff37d 7404 /* NOTREACHED */
661d43c4 7405 NOT_REACHED; /* NOTREACHED */
c255a977
DM
7406
7407 case CURLY_B_min_known_fail:
7408 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 7409
c255a977 7410 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
7411 if (ST.paren) {
7412 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7413 }
c255a977
DM
7414 /* Couldn't or didn't -- move forward. */
7415 ST.oldloc = locinput;
f2ed9b32 7416 if (utf8_target)
c255a977
DM
7417 locinput += UTF8SKIP(locinput);
7418 else
7419 locinput++;
7420 ST.count++;
7421 curly_try_B_min_known:
7422 /* find the next place where 'B' could work, then call B */
7423 {
7424 int n;
f2ed9b32 7425 if (utf8_target) {
c255a977
DM
7426 n = (ST.oldloc == locinput) ? 0 : 1;
7427 if (ST.c1 == ST.c2) {
c255a977 7428 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
7429 while (locinput <= ST.maxpos
7430 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
7431 {
7432 locinput += UTF8SKIP(locinput);
c255a977
DM
7433 n++;
7434 }
1aa99e6b
IH
7435 }
7436 else {
c255a977 7437 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
7438 while (locinput <= ST.maxpos
7439 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7440 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7441 {
7442 locinput += UTF8SKIP(locinput);
c255a977 7443 n++;
1aa99e6b 7444 }
0fe9bf95
IZ
7445 }
7446 }
5400f398 7447 else { /* Not utf8_target */
c255a977
DM
7448 if (ST.c1 == ST.c2) {
7449 while (locinput <= ST.maxpos &&
7450 UCHARAT(locinput) != ST.c1)
7451 locinput++;
bbce6d69 7452 }
c255a977
DM
7453 else {
7454 while (locinput <= ST.maxpos
7455 && UCHARAT(locinput) != ST.c1
7456 && UCHARAT(locinput) != ST.c2)
7457 locinput++;
a0ed51b3 7458 }
c255a977
DM
7459 n = locinput - ST.oldloc;
7460 }
7461 if (locinput > ST.maxpos)
7462 sayNO;
c255a977 7463 if (n) {
eb72505d
DM
7464 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
7465 * at b; check that everything between oldloc and
7466 * locinput matches */
7467 char *li = ST.oldloc;
c255a977 7468 ST.count += n;
f9176b44 7469 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
4633a7c4 7470 sayNO;
eb72505d 7471 assert(n == REG_INFTY || locinput == li);
a0d0e21e 7472 }
c255a977 7473 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 7474 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 7475 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
7476 goto fake_end;
7477 }
4d5016e5 7478 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 7479 }
a74ff37d 7480 /* NOTREACHED */
661d43c4 7481 NOT_REACHED; /* NOTREACHED */
c255a977
DM
7482
7483 case CURLY_B_min_fail:
7484 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
7485
7486 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
7487 if (ST.paren) {
7488 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7489 }
c255a977 7490 /* failed -- move forward one */
f73aaa43 7491 {
eb72505d 7492 char *li = locinput;
f9176b44 7493 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
f73aaa43
DM
7494 sayNO;
7495 }
eb72505d 7496 locinput = li;
f73aaa43
DM
7497 }
7498 {
c255a977 7499 ST.count++;
c255a977
DM
7500 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
7501 ST.count > 0)) /* count overflow ? */
15272685 7502 {
c255a977
DM
7503 curly_try_B_min:
7504 CURLY_SETPAREN(ST.paren, ST.count);
0a4db386 7505 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 7506 cur_eval->u.eval.close_paren == (U32)ST.paren) {
0a4db386
YO
7507 goto fake_end;
7508 }
4d5016e5 7509 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
7510 }
7511 }
c74f6de9 7512 sayNO;
a74ff37d 7513 /* NOTREACHED */
661d43c4 7514 NOT_REACHED; /* NOTREACHED */
c255a977 7515
c52b8b12 7516 curly_try_B_max:
c255a977 7517 /* a successful greedy match: now try to match B */
40d049e4 7518 if (cur_eval && cur_eval->u.eval.close_paren &&
86413ec0 7519 cur_eval->u.eval.close_paren == (U32)ST.paren) {
40d049e4
YO
7520 goto fake_end;
7521 }
c255a977 7522 {
220db18a 7523 bool could_match = locinput < reginfo->strend;
79a2a0e8 7524
c255a977 7525 /* If it could work, try it. */
79a2a0e8
KW
7526 if (ST.c1 != CHRTEST_VOID && could_match) {
7527 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
7528 {
7529 could_match = memEQ(locinput,
7530 ST.c1_utf8,
7531 UTF8SKIP(locinput))
7532 || memEQ(locinput,
7533 ST.c2_utf8,
7534 UTF8SKIP(locinput));
7535 }
7536 else {
7537 could_match = UCHARAT(locinput) == ST.c1
7538 || UCHARAT(locinput) == ST.c2;
7539 }
7540 }
7541 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 7542 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 7543 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
a74ff37d 7544 /* NOTREACHED */
661d43c4 7545 NOT_REACHED; /* NOTREACHED */
c255a977
DM
7546 }
7547 }
924ba076 7548 /* FALLTHROUGH */
3c0563b9 7549
c255a977
DM
7550 case CURLY_B_max_fail:
7551 /* failed to find B in a greedy match */
c255a977
DM
7552
7553 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
7554 if (ST.paren) {
7555 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7556 }
c255a977
DM
7557 /* back up. */
7558 if (--ST.count < ST.min)
7559 sayNO;
eb72505d 7560 locinput = HOPc(locinput, -1);
c255a977
DM
7561 goto curly_try_B_max;
7562
7563#undef ST
7564
3c0563b9 7565 case END: /* last op of main pattern */
c52b8b12 7566 fake_end:
faec1544
DM
7567 if (cur_eval) {
7568 /* we've just finished A in /(??{A})B/; now continue with B */
faec1544 7569
288b8c02 7570 st->u.eval.prev_rex = rex_sv; /* inner */
92da3157
DM
7571
7572 /* Save *all* the positions. */
7573 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
ec43f78b 7574 rex_sv = cur_eval->u.eval.prev_rex;
aed7b151 7575 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 7576 SET_reg_curpm(rex_sv);
8d919b0a 7577 rex = ReANY(rex_sv);
f8fc2ecf 7578 rexi = RXi_GET(rex);
faec1544 7579 cur_curlyx = cur_eval->u.eval.prev_curlyx;
34a81e2b 7580
faec1544 7581 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
7582
7583 /* Restore parens of the outer rex without popping the
7584 * savestack */
92da3157
DM
7585 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
7586 &maxopenparen);
faec1544
DM
7587
7588 st->u.eval.prev_eval = cur_eval;
7589 cur_eval = cur_eval->u.eval.prev_eval;
7590 DEBUG_EXECUTE_r(
2a49f0f5
JH
7591 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
7592 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
e7707071
YO
7593 if ( nochange_depth )
7594 nochange_depth--;
7595
4d5016e5
DM
7596 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
7597 locinput); /* match B */
faec1544
DM
7598 }
7599
3b0527fe 7600 if (locinput < reginfo->till) {
a3621e74 7601 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7821416a
IZ
7602 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
7603 PL_colors[4],
6d59b646
DM
7604 (long)(locinput - startpos),
7605 (long)(reginfo->till - startpos),
7821416a 7606 PL_colors[5]));
58e23c8d 7607
262b90c4 7608 sayNO_SILENT; /* Cannot match: too short. */
7821416a 7609 }
262b90c4 7610 sayYES; /* Success! */
dad79028
DM
7611
7612 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
7613 DEBUG_EXECUTE_r(
7614 PerlIO_printf(Perl_debug_log,
7615 "%*s %ssubpattern success...%s\n",
5bc10b2c 7616 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
262b90c4 7617 sayYES; /* Success! */
dad79028 7618
40a82448
DM
7619#undef ST
7620#define ST st->u.ifmatch
7621
37f53970
DM
7622 {
7623 char *newstart;
7624
40a82448
DM
7625 case SUSPEND: /* (?>A) */
7626 ST.wanted = 1;
37f53970 7627 newstart = locinput;
9041c2e3 7628 goto do_ifmatch;
dad79028 7629
40a82448
DM
7630 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
7631 ST.wanted = 0;
dad79028
DM
7632 goto ifmatch_trivial_fail_test;
7633
40a82448
DM
7634 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
7635 ST.wanted = 1;
dad79028 7636 ifmatch_trivial_fail_test:
a0ed51b3 7637 if (scan->flags) {
52657f30 7638 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
7639 if (!s) {
7640 /* trivial fail */
24d3c4a9
DM
7641 if (logical) {
7642 logical = 0;
f2338a2e 7643 sw = 1 - cBOOL(ST.wanted);
dad79028 7644 }
40a82448 7645 else if (ST.wanted)
dad79028
DM
7646 sayNO;
7647 next = scan + ARG(scan);
7648 if (next == scan)
7649 next = NULL;
7650 break;
7651 }
37f53970 7652 newstart = s;
a0ed51b3
LW
7653 }
7654 else
37f53970 7655 newstart = locinput;
a0ed51b3 7656
c277df42 7657 do_ifmatch:
40a82448 7658 ST.me = scan;
24d3c4a9 7659 ST.logical = logical;
24d786f4
YO
7660 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
7661
40a82448 7662 /* execute body of (?...A) */
37f53970 7663 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
a74ff37d 7664 /* NOTREACHED */
661d43c4 7665 NOT_REACHED; /* NOTREACHED */
37f53970 7666 }
40a82448
DM
7667
7668 case IFMATCH_A_fail: /* body of (?...A) failed */
7669 ST.wanted = !ST.wanted;
924ba076 7670 /* FALLTHROUGH */
40a82448
DM
7671
7672 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 7673 if (ST.logical) {
f2338a2e 7674 sw = cBOOL(ST.wanted);
40a82448
DM
7675 }
7676 else if (!ST.wanted)
7677 sayNO;
7678
37f53970
DM
7679 if (OP(ST.me) != SUSPEND) {
7680 /* restore old position except for (?>...) */
7681 locinput = st->locinput;
40a82448
DM
7682 }
7683 scan = ST.me + ARG(ST.me);
7684 if (scan == ST.me)
7685 scan = NULL;
7686 continue; /* execute B */
7687
7688#undef ST
dad79028 7689
3c0563b9
DM
7690 case LONGJMP: /* alternative with many branches compiles to
7691 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
7692 next = scan + ARG(scan);
7693 if (next == scan)
7694 next = NULL;
a0d0e21e 7695 break;
3c0563b9
DM
7696
7697 case COMMIT: /* (*COMMIT) */
220db18a 7698 reginfo->cutpoint = reginfo->strend;
e2e6a0f1 7699 /* FALLTHROUGH */
3c0563b9
DM
7700
7701 case PRUNE: /* (*PRUNE) */
e2e6a0f1 7702 if (!scan->flags)
ad64d0ec 7703 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 7704 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
a74ff37d 7705 /* NOTREACHED */
661d43c4 7706 NOT_REACHED; /* NOTREACHED */
3c0563b9 7707
54612592
YO
7708 case COMMIT_next_fail:
7709 no_final = 1;
7710 /* FALLTHROUGH */
3c0563b9
DM
7711
7712 case OPFAIL: /* (*FAIL) */
7f69552c 7713 sayNO;
a74ff37d 7714 /* NOTREACHED */
661d43c4 7715 NOT_REACHED; /* NOTREACHED */
e2e6a0f1
YO
7716
7717#define ST st->u.mark
3c0563b9 7718 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 7719 ST.prev_mark = mark_state;
5d458dd8 7720 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 7721 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 7722 mark_state = st;
4d5016e5
DM
7723 ST.mark_loc = locinput;
7724 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
a74ff37d 7725 /* NOTREACHED */
661d43c4 7726 NOT_REACHED; /* NOTREACHED */
3c0563b9 7727
e2e6a0f1
YO
7728 case MARKPOINT_next:
7729 mark_state = ST.prev_mark;
7730 sayYES;
a74ff37d 7731 /* NOTREACHED */
661d43c4 7732 NOT_REACHED; /* NOTREACHED */
3c0563b9 7733
e2e6a0f1 7734 case MARKPOINT_next_fail:
5d458dd8 7735 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
7736 {
7737 if (ST.mark_loc > startpoint)
7738 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7739 popmark = NULL; /* we found our mark */
7740 sv_commit = ST.mark_name;
7741
7742 DEBUG_EXECUTE_r({
5d458dd8 7743 PerlIO_printf(Perl_debug_log,
e2e6a0f1
YO
7744 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
7745 REPORT_CODE_OFF+depth*2, "",
be2597df 7746 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
7747 });
7748 }
7749 mark_state = ST.prev_mark;
5d458dd8
YO
7750 sv_yes_mark = mark_state ?
7751 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 7752 sayNO;
a74ff37d 7753 /* NOTREACHED */
661d43c4 7754 NOT_REACHED; /* NOTREACHED */
3c0563b9
DM
7755
7756 case SKIP: /* (*SKIP) */
5d458dd8 7757 if (scan->flags) {
2bf803e2 7758 /* (*SKIP) : if we fail we cut here*/
5d458dd8 7759 ST.mark_name = NULL;
e2e6a0f1 7760 ST.mark_loc = locinput;
4d5016e5 7761 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 7762 } else {
2bf803e2 7763 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
7764 otherwise do nothing. Meaning we need to scan
7765 */
7766 regmatch_state *cur = mark_state;
ad64d0ec 7767 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
7768
7769 while (cur) {
7770 if ( sv_eq( cur->u.mark.mark_name,
7771 find ) )
7772 {
7773 ST.mark_name = find;
4d5016e5 7774 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
7775 }
7776 cur = cur->u.mark.prev_mark;
7777 }
e2e6a0f1 7778 }
2bf803e2 7779 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 7780 break;
3c0563b9 7781
5d458dd8
YO
7782 case SKIP_next_fail:
7783 if (ST.mark_name) {
7784 /* (*CUT:NAME) - Set up to search for the name as we
7785 collapse the stack*/
7786 popmark = ST.mark_name;
7787 } else {
7788 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
7789 if (ST.mark_loc > startpoint)
7790 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
7791 /* but we set sv_commit to latest mark_name if there
7792 is one so they can test to see how things lead to this
7793 cut */
7794 if (mark_state)
7795 sv_commit=mark_state->u.mark.mark_name;
7796 }
e2e6a0f1
YO
7797 no_final = 1;
7798 sayNO;
a74ff37d 7799 /* NOTREACHED */
661d43c4 7800 NOT_REACHED; /* NOTREACHED */
e2e6a0f1 7801#undef ST
3c0563b9
DM
7802
7803 case LNBREAK: /* \R */
220db18a 7804 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
e1d1eefb 7805 locinput += n;
e1d1eefb
YO
7806 } else
7807 sayNO;
7808 break;
7809
a0d0e21e 7810 default:
b900a521 7811 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
d7d93a81 7812 PTR2UV(scan), OP(scan));
cea2e8a9 7813 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
7814
7815 /* this is a point to jump to in order to increment
7816 * locinput by one character */
c52b8b12 7817 increment_locinput:
e6ca698c 7818 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
7819 if (utf8_target) {
7820 locinput += PL_utf8skip[nextchr];
7016d6eb 7821 /* locinput is allowed to go 1 char off the end, but not 2+ */
220db18a 7822 if (locinput > reginfo->strend)
28b98f76 7823 sayNO;
28b98f76
DM
7824 }
7825 else
3640db6b 7826 locinput++;
28b98f76 7827 break;
5d458dd8
YO
7828
7829 } /* end switch */
95b24440 7830
5d458dd8
YO
7831 /* switch break jumps here */
7832 scan = next; /* prepare to execute the next op and ... */
7833 continue; /* ... jump back to the top, reusing st */
a74ff37d 7834 /* NOTREACHED */
95b24440 7835
40a82448
DM
7836 push_yes_state:
7837 /* push a state that backtracks on success */
7838 st->u.yes.prev_yes_state = yes_state;
7839 yes_state = st;
924ba076 7840 /* FALLTHROUGH */
40a82448
DM
7841 push_state:
7842 /* push a new regex state, then continue at scan */
7843 {
7844 regmatch_state *newst;
7845
24b23f37
YO
7846 DEBUG_STACK_r({
7847 regmatch_state *cur = st;
7848 regmatch_state *curyes = yes_state;
7849 int curd = depth;
7850 regmatch_slab *slab = PL_regmatch_slab;
7851 for (;curd > -1;cur--,curd--) {
7852 if (cur < SLAB_FIRST(slab)) {
7853 slab = slab->prev;
7854 cur = SLAB_LAST(slab);
7855 }
7856 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
7857 REPORT_CODE_OFF + 2 + depth * 2,"",
13d6edb4 7858 curd, PL_reg_name[cur->resume_state],
24b23f37
YO
7859 (curyes == cur) ? "yes" : ""
7860 );
7861 if (curyes == cur)
7862 curyes = cur->u.yes.prev_yes_state;
7863 }
7864 } else
7865 DEBUG_STATE_pp("push")
7866 );
40a82448 7867 depth++;
40a82448
DM
7868 st->locinput = locinput;
7869 newst = st+1;
7870 if (newst > SLAB_LAST(PL_regmatch_slab))
7871 newst = S_push_slab(aTHX);
7872 PL_regmatch_state = newst;
786e8c11 7873
4d5016e5 7874 locinput = pushinput;
40a82448
DM
7875 st = newst;
7876 continue;
a74ff37d 7877 /* NOTREACHED */
40a82448 7878 }
a0d0e21e 7879 }
a687059c 7880
a0d0e21e
LW
7881 /*
7882 * We get here only if there's trouble -- normally "case END" is
7883 * the terminating point.
7884 */
cea2e8a9 7885 Perl_croak(aTHX_ "corrupted regexp pointers");
a74ff37d 7886 /* NOTREACHED */
4633a7c4 7887 sayNO;
661d43c4 7888 NOT_REACHED; /* NOTREACHED */
4633a7c4 7889
7b52d656 7890 yes:
77cb431f
DM
7891 if (yes_state) {
7892 /* we have successfully completed a subexpression, but we must now
7893 * pop to the state marked by yes_state and continue from there */
77cb431f 7894 assert(st != yes_state);
5bc10b2c
DM
7895#ifdef DEBUGGING
7896 while (st != yes_state) {
7897 st--;
7898 if (st < SLAB_FIRST(PL_regmatch_slab)) {
7899 PL_regmatch_slab = PL_regmatch_slab->prev;
7900 st = SLAB_LAST(PL_regmatch_slab);
7901 }
e2e6a0f1 7902 DEBUG_STATE_r({
54612592
YO
7903 if (no_final) {
7904 DEBUG_STATE_pp("pop (no final)");
7905 } else {
7906 DEBUG_STATE_pp("pop (yes)");
7907 }
e2e6a0f1 7908 });
5bc10b2c
DM
7909 depth--;
7910 }
7911#else
77cb431f
DM
7912 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
7913 || yes_state > SLAB_LAST(PL_regmatch_slab))
7914 {
7915 /* not in this slab, pop slab */
7916 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
7917 PL_regmatch_slab = PL_regmatch_slab->prev;
7918 st = SLAB_LAST(PL_regmatch_slab);
7919 }
7920 depth -= (st - yes_state);
5bc10b2c 7921#endif
77cb431f
DM
7922 st = yes_state;
7923 yes_state = st->u.yes.prev_yes_state;
7924 PL_regmatch_state = st;
24b23f37 7925
3640db6b 7926 if (no_final)
5d458dd8 7927 locinput= st->locinput;
54612592 7928 state_num = st->resume_state + no_final;
24d3c4a9 7929 goto reenter_switch;
77cb431f
DM
7930 }
7931
a3621e74 7932 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
e4584336 7933 PL_colors[4], PL_colors[5]));
02db2b7b 7934
bf2039a9 7935 if (reginfo->info_aux_eval) {
19b95bf0
DM
7936 /* each successfully executed (?{...}) block does the equivalent of
7937 * local $^R = do {...}
7938 * When popping the save stack, all these locals would be undone;
7939 * bypass this by setting the outermost saved $^R to the latest
7940 * value */
4b22688e
YO
7941 /* I dont know if this is needed or works properly now.
7942 * see code related to PL_replgv elsewhere in this file.
7943 * Yves
7944 */
19b95bf0
DM
7945 if (oreplsv != GvSV(PL_replgv))
7946 sv_setsv(oreplsv, GvSV(PL_replgv));
7947 }
95b24440 7948 result = 1;
aa283a38 7949 goto final_exit;
4633a7c4 7950
7b52d656 7951 no:
a3621e74 7952 DEBUG_EXECUTE_r(
7821416a 7953 PerlIO_printf(Perl_debug_log,
786e8c11 7954 "%*s %sfailed...%s\n",
5bc10b2c 7955 REPORT_CODE_OFF+depth*2, "",
786e8c11 7956 PL_colors[4], PL_colors[5])
7821416a 7957 );
aa283a38 7958
7b52d656 7959 no_silent:
54612592
YO
7960 if (no_final) {
7961 if (yes_state) {
7962 goto yes;
7963 } else {
7964 goto final_exit;
7965 }
7966 }
aa283a38
DM
7967 if (depth) {
7968 /* there's a previous state to backtrack to */
40a82448
DM
7969 st--;
7970 if (st < SLAB_FIRST(PL_regmatch_slab)) {
7971 PL_regmatch_slab = PL_regmatch_slab->prev;
7972 st = SLAB_LAST(PL_regmatch_slab);
7973 }
7974 PL_regmatch_state = st;
40a82448 7975 locinput= st->locinput;
40a82448 7976
5bc10b2c
DM
7977 DEBUG_STATE_pp("pop");
7978 depth--;
262b90c4
DM
7979 if (yes_state == st)
7980 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 7981
24d3c4a9
DM
7982 state_num = st->resume_state + 1; /* failure = success + 1 */
7983 goto reenter_switch;
95b24440 7984 }
24d3c4a9 7985 result = 0;
aa283a38 7986
262b90c4 7987 final_exit:
bbe252da 7988 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
7989 SV *sv_err = get_sv("REGERROR", 1);
7990 SV *sv_mrk = get_sv("REGMARK", 1);
7991 if (result) {
e2e6a0f1 7992 sv_commit = &PL_sv_no;
5d458dd8
YO
7993 if (!sv_yes_mark)
7994 sv_yes_mark = &PL_sv_yes;
7995 } else {
7996 if (!sv_commit)
7997 sv_commit = &PL_sv_yes;
7998 sv_yes_mark = &PL_sv_no;
7999 }
316ebaf2
JH
8000 assert(sv_err);
8001 assert(sv_mrk);
5d458dd8
YO
8002 sv_setsv(sv_err, sv_commit);
8003 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 8004 }
19b95bf0 8005
81ed78b2
DM
8006
8007 if (last_pushed_cv) {
8008 dSP;
8009 POP_MULTICALL;
4f8dbb2d 8010 PERL_UNUSED_VAR(SP);
81ed78b2
DM
8011 }
8012
9d9163fb
DM
8013 assert(!result || locinput - reginfo->strbeg >= 0);
8014 return result ? locinput - reginfo->strbeg : -1;
a687059c
LW
8015}
8016
8017/*
8018 - regrepeat - repeatedly match something simple, report how many
d60de1d1 8019 *
e64f369d
KW
8020 * What 'simple' means is a node which can be the operand of a quantifier like
8021 * '+', or {1,3}
8022 *
d60de1d1
DM
8023 * startposp - pointer a pointer to the start position. This is updated
8024 * to point to the byte following the highest successful
8025 * match.
8026 * p - the regnode to be repeatedly matched against.
220db18a 8027 * reginfo - struct holding match state, such as strend
4063ade8 8028 * max - maximum number of things to match.
d60de1d1 8029 * depth - (for debugging) backtracking depth.
a687059c 8030 */
76e3520e 8031STATIC I32
272d35c9 8032S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
f9176b44 8033 regmatch_info *const reginfo, I32 max, int depth)
a687059c 8034{
4063ade8 8035 char *scan; /* Pointer to current position in target string */
eb578fdb 8036 I32 c;
220db18a 8037 char *loceol = reginfo->strend; /* local version */
4063ade8 8038 I32 hardcount = 0; /* How many matches so far */
ba44c216 8039 bool utf8_target = reginfo->is_utf8_target;
b53eee5d 8040 unsigned int to_complement = 0; /* Invert the result? */
d513472c 8041 UV utf8_flags;
3018b823 8042 _char_class_number classnum;
4f55667c
SP
8043#ifndef DEBUGGING
8044 PERL_UNUSED_ARG(depth);
8045#endif
a0d0e21e 8046
7918f24d
NC
8047 PERL_ARGS_ASSERT_REGREPEAT;
8048
f73aaa43 8049 scan = *startposp;
faf11cac
HS
8050 if (max == REG_INFTY)
8051 max = I32_MAX;
dfb8f192 8052 else if (! utf8_target && loceol - scan > max)
7f596f4c 8053 loceol = scan + max;
4063ade8
KW
8054
8055 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8056 * to the maximum of how far we should go in it (leaving it set to the real
8057 * end, if the maximum permissible would take us beyond that). This allows
8058 * us to make the loop exit condition that we haven't gone past <loceol> to
8059 * also mean that we haven't exceeded the max permissible count, saving a
8060 * test each time through the loop. But it assumes that the OP matches a
8061 * single byte, which is true for most of the OPs below when applied to a
8062 * non-UTF-8 target. Those relatively few OPs that don't have this
8063 * characteristic will have to compensate.
8064 *
8065 * There is no adjustment for UTF-8 targets, as the number of bytes per
8066 * character varies. OPs will have to test both that the count is less
8067 * than the max permissible (using <hardcount> to keep track), and that we
8068 * are still within the bounds of the string (using <loceol>. A few OPs
8069 * match a single byte no matter what the encoding. They can omit the max
8070 * test if, for the UTF-8 case, they do the adjustment that was skipped
8071 * above.
8072 *
8073 * Thus, the code above sets things up for the common case; and exceptional
8074 * cases need extra work; the common case is to make sure <scan> doesn't
8075 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8076 * count doesn't exceed the maximum permissible */
8077
a0d0e21e 8078 switch (OP(p)) {
22c35a8c 8079 case REG_ANY:
f2ed9b32 8080 if (utf8_target) {
1aa99e6b 8081 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
8082 scan += UTF8SKIP(scan);
8083 hardcount++;
8084 }
8085 } else {
8086 while (scan < loceol && *scan != '\n')
8087 scan++;
a0ed51b3
LW
8088 }
8089 break;
ffc61ed2 8090 case SANY:
f2ed9b32 8091 if (utf8_target) {
a0804c9e 8092 while (scan < loceol && hardcount < max) {
def8e4ea
JH
8093 scan += UTF8SKIP(scan);
8094 hardcount++;
8095 }
8096 }
8097 else
8098 scan = loceol;
a0ed51b3 8099 break;
a4525e78 8100 case EXACTL:
780fcc9f 8101 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
613abc6d
KW
8102 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8103 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8104 }
780fcc9f 8105 /* FALLTHROUGH */
59d32103 8106 case EXACT:
f9176b44 8107 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 8108
59d32103 8109 c = (U8)*STRING(p);
59d32103 8110
5e4a1da1
KW
8111 /* Can use a simple loop if the pattern char to match on is invariant
8112 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
8113 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8114 * true iff it doesn't matter if the argument is in UTF-8 or not */
f9176b44 8115 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
e9369824 8116 if (utf8_target && loceol - scan > max) {
4063ade8
KW
8117 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8118 * since here, to match at all, 1 char == 1 byte */
8119 loceol = scan + max;
8120 }
59d32103
KW
8121 while (scan < loceol && UCHARAT(scan) == c) {
8122 scan++;
8123 }
8124 }
f9176b44 8125 else if (reginfo->is_utf8_pat) {
5e4a1da1
KW
8126 if (utf8_target) {
8127 STRLEN scan_char_len;
5e4a1da1 8128
4063ade8 8129 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
8130 * string EQ */
8131 while (hardcount < max
9a902117
KW
8132 && scan < loceol
8133 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
5e4a1da1
KW
8134 && memEQ(scan, STRING(p), scan_char_len))
8135 {
4200a00c 8136 scan += scan_char_len;
5e4a1da1
KW
8137 hardcount++;
8138 }
8139 }
8140 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 8141
5e4a1da1
KW
8142 /* Target isn't utf8; convert the character in the UTF-8
8143 * pattern to non-UTF8, and do a simple loop */
94bb8c36 8144 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
5e4a1da1
KW
8145 while (scan < loceol && UCHARAT(scan) == c) {
8146 scan++;
8147 }
8148 } /* else pattern char is above Latin1, can't possibly match the
8149 non-UTF-8 target */
b40a2c17 8150 }
5e4a1da1 8151 else {
59d32103 8152
5e4a1da1
KW
8153 /* Here, the string must be utf8; pattern isn't, and <c> is
8154 * different in utf8 than not, so can't compare them directly.
8155 * Outside the loop, find the two utf8 bytes that represent c, and
8156 * then look for those in sequence in the utf8 string */
59d32103
KW
8157 U8 high = UTF8_TWO_BYTE_HI(c);
8158 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
8159
8160 while (hardcount < max
8161 && scan + 1 < loceol
8162 && UCHARAT(scan) == high
8163 && UCHARAT(scan + 1) == low)
8164 {
8165 scan += 2;
8166 hardcount++;
8167 }
8168 }
8169 break;
5e4a1da1 8170
098b07d5
KW
8171 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
8172 assert(! reginfo->is_utf8_pat);
924ba076 8173 /* FALLTHROUGH */
2f7f8cb1 8174 case EXACTFA:
098b07d5 8175 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
8176 goto do_exactf;
8177
d4e0b827 8178 case EXACTFL:
780fcc9f 8179 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
cea315b6 8180 utf8_flags = FOLDEQ_LOCALE;
17580e7a
KW
8181 goto do_exactf;
8182
2fdb7295
KW
8183 case EXACTF: /* This node only generated for non-utf8 patterns */
8184 assert(! reginfo->is_utf8_pat);
098b07d5
KW
8185 utf8_flags = 0;
8186 goto do_exactf;
62bf7766 8187
a4525e78
KW
8188 case EXACTFLU8:
8189 if (! utf8_target) {
8190 break;
8191 }
613abc6d
KW
8192 utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8193 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
8194 goto do_exactf;
8195
3c760661 8196 case EXACTFU_SS:
9a5a5549 8197 case EXACTFU:
f9176b44 8198 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 8199
c52b8b12 8200 do_exactf: {
613a425d
KW
8201 int c1, c2;
8202 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 8203
f9176b44 8204 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 8205
984e6dd1 8206 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
aed7b151 8207 reginfo))
984e6dd1 8208 {
613a425d 8209 if (c1 == CHRTEST_VOID) {
49b95fad 8210 /* Use full Unicode fold matching */
220db18a 8211 char *tmpeol = reginfo->strend;
f9176b44 8212 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
49b95fad
KW
8213 while (hardcount < max
8214 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8215 STRING(p), NULL, pat_len,
f9176b44 8216 reginfo->is_utf8_pat, utf8_flags))
49b95fad
KW
8217 {
8218 scan = tmpeol;
220db18a 8219 tmpeol = reginfo->strend;
49b95fad
KW
8220 hardcount++;
8221 }
613a425d
KW
8222 }
8223 else if (utf8_target) {
8224 if (c1 == c2) {
4063ade8
KW
8225 while (scan < loceol
8226 && hardcount < max
613a425d
KW
8227 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8228 {
8229 scan += UTF8SKIP(scan);
8230 hardcount++;
8231 }
8232 }
8233 else {
4063ade8
KW
8234 while (scan < loceol
8235 && hardcount < max
613a425d
KW
8236 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8237 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8238 {
8239 scan += UTF8SKIP(scan);
8240 hardcount++;
8241 }
8242 }
8243 }
8244 else if (c1 == c2) {
8245 while (scan < loceol && UCHARAT(scan) == c1) {
8246 scan++;
8247 }
8248 }
8249 else {
8250 while (scan < loceol &&
8251 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8252 {
8253 scan++;
8254 }
8255 }
634c83a2 8256 }
bbce6d69 8257 break;
613a425d 8258 }
a4525e78 8259 case ANYOFL:
780fcc9f 8260 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30
KW
8261
8262 if ((FLAGS(p) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
8263 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
8264 }
780fcc9f 8265 /* FALLTHROUGH */
ac44c12e 8266 case ANYOFD:
a0d0e21e 8267 case ANYOF:
e0193e47 8268 if (utf8_target) {
4e8910e0 8269 while (hardcount < max
9a902117 8270 && scan < loceol
3db24e1e 8271 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
4e8910e0 8272 {
9a902117 8273 scan += UTF8SKIP(scan);
ffc61ed2
JH
8274 hardcount++;
8275 }
8276 } else {
32fc9b6a 8277 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
ffc61ed2
JH
8278 scan++;
8279 }
a0d0e21e 8280 break;
4063ade8 8281
3018b823 8282 /* The argument (FLAGS) to all the POSIX node types is the class number */
980866de 8283
3018b823
KW
8284 case NPOSIXL:
8285 to_complement = 1;
8286 /* FALLTHROUGH */
980866de 8287
3018b823 8288 case POSIXL:
780fcc9f 8289 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823
KW
8290 if (! utf8_target) {
8291 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8292 *scan)))
a12cf05f 8293 {
3018b823
KW
8294 scan++;
8295 }
8296 } else {
8297 while (hardcount < max && scan < loceol
8298 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8299 (U8 *) scan)))
8300 {
8301 scan += UTF8SKIP(scan);
ffc61ed2
JH
8302 hardcount++;
8303 }
a0ed51b3
LW
8304 }
8305 break;
0658cdde 8306
3018b823
KW
8307 case POSIXD:
8308 if (utf8_target) {
8309 goto utf8_posix;
8310 }
8311 /* FALLTHROUGH */
8312
0658cdde 8313 case POSIXA:
0430522f 8314 if (utf8_target && loceol - scan > max) {
4063ade8 8315
7aee35ff
KW
8316 /* We didn't adjust <loceol> at the beginning of this routine
8317 * because is UTF-8, but it is actually ok to do so, since here, to
8318 * match, 1 char == 1 byte. */
4063ade8
KW
8319 loceol = scan + max;
8320 }
8321 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
8322 scan++;
8323 }
8324 break;
980866de 8325
3018b823
KW
8326 case NPOSIXD:
8327 if (utf8_target) {
8328 to_complement = 1;
8329 goto utf8_posix;
8330 }
924ba076 8331 /* FALLTHROUGH */
980866de 8332
3018b823
KW
8333 case NPOSIXA:
8334 if (! utf8_target) {
8335 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
a12cf05f
KW
8336 scan++;
8337 }
4063ade8 8338 }
3018b823 8339 else {
980866de 8340
3018b823 8341 /* The complement of something that matches only ASCII matches all
837226c8 8342 * non-ASCII, plus everything in ASCII that isn't in the class. */
bedac28b 8343 while (hardcount < max && scan < loceol
837226c8 8344 && (! isASCII_utf8(scan)
3018b823 8345 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
a12cf05f 8346 {
3018b823 8347 scan += UTF8SKIP(scan);
ffc61ed2
JH
8348 hardcount++;
8349 }
3018b823
KW
8350 }
8351 break;
980866de 8352
3018b823
KW
8353 case NPOSIXU:
8354 to_complement = 1;
8355 /* FALLTHROUGH */
8356
8357 case POSIXU:
8358 if (! utf8_target) {
8359 while (scan < loceol && to_complement
8360 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
4063ade8 8361 {
3018b823
KW
8362 scan++;
8363 }
cfaf538b
KW
8364 }
8365 else {
c52b8b12 8366 utf8_posix:
3018b823
KW
8367 classnum = (_char_class_number) FLAGS(p);
8368 if (classnum < _FIRST_NON_SWASH_CC) {
8369
8370 /* Here, a swash is needed for above-Latin1 code points.
8371 * Process as many Latin1 code points using the built-in rules.
8372 * Go to another loop to finish processing upon encountering
8373 * the first Latin1 code point. We could do that in this loop
8374 * as well, but the other way saves having to test if the swash
8375 * has been loaded every time through the loop: extra space to
8376 * save a test. */
8377 while (hardcount < max && scan < loceol) {
8378 if (UTF8_IS_INVARIANT(*scan)) {
8379 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
8380 classnum))))
8381 {
8382 break;
8383 }
8384 scan++;
8385 }
8386 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
8387 if (! (to_complement
94bb8c36
KW
8388 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
8389 *(scan + 1)),
3018b823
KW
8390 classnum))))
8391 {
8392 break;
8393 }
8394 scan += 2;
8395 }
8396 else {
8397 goto found_above_latin1;
8398 }
8399
8400 hardcount++;
8401 }
8402 }
8403 else {
8404 /* For these character classes, the knowledge of how to handle
8405 * every code point is compiled in to Perl via a macro. This
8406 * code is written for making the loops as tight as possible.
8407 * It could be refactored to save space instead */
8408 switch (classnum) {
779cf272 8409 case _CC_ENUM_SPACE:
3018b823
KW
8410 while (hardcount < max
8411 && scan < loceol
8412 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
8413 {
8414 scan += UTF8SKIP(scan);
8415 hardcount++;
8416 }
8417 break;
8418 case _CC_ENUM_BLANK:
8419 while (hardcount < max
8420 && scan < loceol
8421 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
8422 {
8423 scan += UTF8SKIP(scan);
8424 hardcount++;
8425 }
8426 break;
8427 case _CC_ENUM_XDIGIT:
8428 while (hardcount < max
8429 && scan < loceol
8430 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
8431 {
8432 scan += UTF8SKIP(scan);
8433 hardcount++;
8434 }
8435 break;
8436 case _CC_ENUM_VERTSPACE:
8437 while (hardcount < max
8438 && scan < loceol
8439 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
8440 {
8441 scan += UTF8SKIP(scan);
8442 hardcount++;
8443 }
8444 break;
8445 case _CC_ENUM_CNTRL:
8446 while (hardcount < max
8447 && scan < loceol
8448 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
8449 {
8450 scan += UTF8SKIP(scan);
8451 hardcount++;
8452 }
8453 break;
8454 default:
8455 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
8456 }
8457 }
a0ed51b3 8458 }
3018b823 8459 break;
4063ade8 8460
3018b823
KW
8461 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
8462
8463 /* Load the swash if not already present */
8464 if (! PL_utf8_swash_ptrs[classnum]) {
8465 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
8466 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
2a16ac92
KW
8467 "utf8",
8468 "",
8469 &PL_sv_undef, 1, 0,
8470 PL_XPosix_ptrs[classnum], &flags);
4063ade8 8471 }
3018b823
KW
8472
8473 while (hardcount < max && scan < loceol
8474 && to_complement ^ cBOOL(_generic_utf8(
8475 classnum,
8476 scan,
8477 swash_fetch(PL_utf8_swash_ptrs[classnum],
8478 (U8 *) scan,
8479 TRUE))))
8480 {
8481 scan += UTF8SKIP(scan);
8482 hardcount++;
8483 }
8484 break;
8485
e1d1eefb 8486 case LNBREAK:
e64f369d
KW
8487 if (utf8_target) {
8488 while (hardcount < max && scan < loceol &&
8489 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
8490 scan += c;
8491 hardcount++;
8492 }
8493 } else {
8494 /* LNBREAK can match one or two latin chars, which is ok, but we
8495 * have to use hardcount in this situation, and throw away the
8496 * adjustment to <loceol> done before the switch statement */
220db18a 8497 loceol = reginfo->strend;
e64f369d
KW
8498 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
8499 scan+=c;
8500 hardcount++;
8501 }
8502 }
8503 break;
e1d1eefb 8504
780fcc9f
KW
8505 case BOUNDL:
8506 case NBOUNDL:
8507 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8508 /* FALLTHROUGH */
584b1f02
KW
8509 case BOUND:
8510 case BOUNDA:
584b1f02
KW
8511 case BOUNDU:
8512 case EOS:
8513 case GPOS:
8514 case KEEPS:
8515 case NBOUND:
8516 case NBOUNDA:
584b1f02
KW
8517 case NBOUNDU:
8518 case OPFAIL:
8519 case SBOL:
8520 case SEOL:
8521 /* These are all 0 width, so match right here or not at all. */
8522 break;
8523
8524 default:
8525 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
a74ff37d 8526 /* NOTREACHED */
661d43c4 8527 NOT_REACHED; /* NOTREACHED */
584b1f02 8528
a0d0e21e 8529 }
a687059c 8530
a0ed51b3
LW
8531 if (hardcount)
8532 c = hardcount;
8533 else
f73aaa43
DM
8534 c = scan - *startposp;
8535 *startposp = scan;
a687059c 8536
a3621e74 8537 DEBUG_r({
e68ec53f 8538 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 8539 DEBUG_EXECUTE_r({
e68ec53f 8540 SV * const prop = sv_newmortal();
8b9781c9 8541 regprop(prog, prop, p, reginfo, NULL);
e68ec53f 8542 PerlIO_printf(Perl_debug_log,
be8e71aa 8543 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
e2e6a0f1 8544 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 8545 });
be8e71aa 8546 });
9041c2e3 8547
a0d0e21e 8548 return(c);
a687059c
LW
8549}
8550
c277df42 8551
be8e71aa 8552#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 8553/*
6c6525b8 8554- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
8555create a copy so that changes the caller makes won't change the shared one.
8556If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 8557 */
ffc61ed2 8558SV *
5aaab254 8559Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 8560{
6c6525b8 8561 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
8562
8563 if (altsvp) {
8564 *altsvp = NULL;
8565 }
8566
ef9bc832 8567 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
6c6525b8 8568}
6c6525b8 8569
3e63bed3 8570#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
ffc61ed2
JH
8571
8572/*
ba7b4546 8573 - reginclass - determine if a character falls into a character class
832705d4 8574
a4525e78 8575 n is the ANYOF-type regnode
6698fab5 8576 p is the target string
3db24e1e 8577 p_end points to one byte beyond the end of the target string
6698fab5 8578 utf8_target tells whether p is in UTF-8.
832705d4 8579
635cd5d4 8580 Returns true if matched; false otherwise.
eba1359e 8581
d5788240
KW
8582 Note that this can be a synthetic start class, a combination of various
8583 nodes, so things you think might be mutually exclusive, such as locale,
8584 aren't. It can match both locale and non-locale
8585
bbce6d69 8586 */
8587
76e3520e 8588STATIC bool
3db24e1e 8589S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
bbce6d69 8590{
27da23d5 8591 dVAR;
a3b680e6 8592 const char flags = ANYOF_FLAGS(n);
bbce6d69 8593 bool match = FALSE;
cc07378b 8594 UV c = *p;
1aa99e6b 8595
7918f24d
NC
8596 PERL_ARGS_ASSERT_REGINCLASS;
8597
afd2eb18
KW
8598 /* If c is not already the code point, get it. Note that
8599 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
8600 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 8601 STRLEN c_len = 0;
3db24e1e 8602 c = utf8n_to_uvchr(p, p_end - p, &c_len,
6182169b
KW
8603 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
8604 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
8605 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
8606 * UTF8_ALLOW_FFFF */
f7ab54c6 8607 if (c_len == (STRLEN)-1)
e8a70c6f 8608 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
a0bd1a30 8609 if (c > 255 && OP(n) == ANYOFL && ! (flags & ANYOF_LOC_REQ_UTF8)) {
613abc6d
KW
8610 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
8611 }
19f67299 8612 }
4b3cda86 8613
7cdde544 8614 /* If this character is potentially in the bitmap, check it */
dcb20b36 8615 if (c < NUM_ANYOF_CODE_POINTS) {
ffc61ed2
JH
8616 if (ANYOF_BITMAP_TEST(n, c))
8617 match = TRUE;
f240c685
KW
8618 else if ((flags
8619 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
8620 && OP(n) == ANYOFD
93e92956
KW
8621 && ! utf8_target
8622 && ! isASCII(c))
11454c59
KW
8623 {
8624 match = TRUE;
8625 }
1462525b 8626 else if (flags & ANYOF_LOCALE_FLAGS) {
6942fd9a 8627 if ((flags & ANYOF_LOC_FOLD)
e0a1ff7a 8628 && c < 256
6942fd9a
KW
8629 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
8630 {
8631 match = TRUE;
b99851e1 8632 }
e0a1ff7a
KW
8633 else if (ANYOF_POSIXL_TEST_ANY_SET(n)
8634 && c < 256
8635 ) {
31c7f561
KW
8636
8637 /* The data structure is arranged so bits 0, 2, 4, ... are set
8638 * if the class includes the Posix character class given by
8639 * bit/2; and 1, 3, 5, ... are set if the class includes the
8640 * complemented Posix class given by int(bit/2). So we loop
8641 * through the bits, each time changing whether we complement
8642 * the result or not. Suppose for the sake of illustration
8643 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
8644 * is set, it means there is a match for this ANYOF node if the
8645 * character is in the class given by the expression (0 / 2 = 0
8646 * = \w). If it is in that class, isFOO_lc() will return 1,
8647 * and since 'to_complement' is 0, the result will stay TRUE,
8648 * and we exit the loop. Suppose instead that bit 0 is 0, but
8649 * bit 1 is 1. That means there is a match if the character
8650 * matches \W. We won't bother to call isFOO_lc() on bit 0,
8651 * but will on bit 1. On the second iteration 'to_complement'
8652 * will be 1, so the exclusive or will reverse things, so we
8653 * are testing for \W. On the third iteration, 'to_complement'
8654 * will be 0, and we would be testing for \s; the fourth
b0d691b2
KW
8655 * iteration would test for \S, etc.
8656 *
8657 * Note that this code assumes that all the classes are closed
8658 * under folding. For example, if a character matches \w, then
8659 * its fold does too; and vice versa. This should be true for
8660 * any well-behaved locale for all the currently defined Posix
8661 * classes, except for :lower: and :upper:, which are handled
8662 * by the pseudo-class :cased: which matches if either of the
8663 * other two does. To get rid of this assumption, an outer
8664 * loop could be used below to iterate over both the source
8665 * character, and its fold (if different) */
31c7f561
KW
8666
8667 int count = 0;
8668 int to_complement = 0;
522b3c1e 8669
31c7f561 8670 while (count < ANYOF_MAX) {
8efd3f97 8671 if (ANYOF_POSIXL_TEST(n, count)
31c7f561
KW
8672 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
8673 {
8674 match = TRUE;
8675 break;
8676 }
8677 count++;
8678 to_complement ^= 1;
8679 }
ffc61ed2 8680 }
a0ed51b3 8681 }
a0ed51b3
LW
8682 }
8683
31f05a37 8684
7cdde544 8685 /* If the bitmap didn't (or couldn't) match, and something outside the
3b04b210 8686 * bitmap could match, try that. */
ef87b810 8687 if (!match) {
93e92956
KW
8688 if (c >= NUM_ANYOF_CODE_POINTS
8689 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
8690 {
8691 match = TRUE; /* Everything above the bitmap matches */
e051a21d 8692 }
93e92956
KW
8693 else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
8694 || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
1ee208c4
KW
8695 || ((flags & ANYOF_LOC_FOLD)
8696 && IN_UTF8_CTYPE_LOCALE
93e92956 8697 && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
3b04b210 8698 {
1ee208c4
KW
8699 SV* only_utf8_locale = NULL;
8700 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
ef9bc832 8701 &only_utf8_locale, NULL);
7cdde544 8702 if (sw) {
893ef8be 8703 U8 utf8_buffer[2];
7cdde544
KW
8704 U8 * utf8_p;
8705 if (utf8_target) {
8706 utf8_p = (U8 *) p;
e0193e47 8707 } else { /* Convert to utf8 */
893ef8be
KW
8708 utf8_p = utf8_buffer;
8709 append_utf8_from_native_byte(*p, &utf8_p);
8710 utf8_p = utf8_buffer;
7cdde544 8711 }
f56b6394 8712
e0193e47 8713 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 8714 match = TRUE;
e0193e47 8715 }
7cdde544 8716 }
1ee208c4
KW
8717 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
8718 match = _invlist_contains_cp(only_utf8_locale, c);
8719 }
7cdde544 8720 }
5073ffbd
KW
8721
8722 if (UNICODE_IS_SUPER(c)
f240c685
KW
8723 && (flags
8724 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
8725 && OP(n) != ANYOFD
5073ffbd
KW
8726 && ckWARN_d(WARN_NON_UNICODE))
8727 {
8728 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2d88a86a 8729 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
5073ffbd 8730 }
7cdde544
KW
8731 }
8732
5dbb0c08
KW
8733#if ANYOF_INVERT != 1
8734 /* Depending on compiler optimization cBOOL takes time, so if don't have to
8735 * use it, don't */
8736# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
8737#endif
8738
f0fdc1c9 8739 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
5dbb0c08 8740 return (flags & ANYOF_INVERT) ^ match;
a0ed51b3 8741}
161b471a 8742
dfe13c55 8743STATIC U8 *
ea3daa5d 8744S_reghop3(U8 *s, SSize_t off, const U8* lim)
9041c2e3 8745{
6af86488
KW
8746 /* return the position 'off' UTF-8 characters away from 's', forward if
8747 * 'off' >= 0, backwards if negative. But don't go outside of position
8748 * 'lim', which better be < s if off < 0 */
8749
7918f24d
NC
8750 PERL_ARGS_ASSERT_REGHOP3;
8751
a0ed51b3 8752 if (off >= 0) {
1aa99e6b 8753 while (off-- && s < lim) {
ffc61ed2 8754 /* XXX could check well-formedness here */
a0ed51b3 8755 s += UTF8SKIP(s);
ffc61ed2 8756 }
a0ed51b3
LW
8757 }
8758 else {
1de06328
YO
8759 while (off++ && s > lim) {
8760 s--;
8761 if (UTF8_IS_CONTINUED(*s)) {
8762 while (s > lim && UTF8_IS_CONTINUATION(*s))
8763 s--;
a0ed51b3 8764 }
1de06328 8765 /* XXX could check well-formedness here */
a0ed51b3
LW
8766 }
8767 }
8768 return s;
8769}
161b471a 8770
dfe13c55 8771STATIC U8 *
ea3daa5d 8772S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
1de06328 8773{
7918f24d
NC
8774 PERL_ARGS_ASSERT_REGHOP4;
8775
1de06328
YO
8776 if (off >= 0) {
8777 while (off-- && s < rlim) {
8778 /* XXX could check well-formedness here */
8779 s += UTF8SKIP(s);
8780 }
8781 }
8782 else {
8783 while (off++ && s > llim) {
8784 s--;
8785 if (UTF8_IS_CONTINUED(*s)) {
8786 while (s > llim && UTF8_IS_CONTINUATION(*s))
8787 s--;
8788 }
8789 /* XXX could check well-formedness here */
8790 }
8791 }
8792 return s;
8793}
1de06328 8794
557f47af
DM
8795/* like reghop3, but returns NULL on overrun, rather than returning last
8796 * char pos */
8797
1de06328 8798STATIC U8 *
ea3daa5d 8799S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
a0ed51b3 8800{
7918f24d
NC
8801 PERL_ARGS_ASSERT_REGHOPMAYBE3;
8802
a0ed51b3 8803 if (off >= 0) {
1aa99e6b 8804 while (off-- && s < lim) {
ffc61ed2 8805 /* XXX could check well-formedness here */
a0ed51b3 8806 s += UTF8SKIP(s);
ffc61ed2 8807 }
a0ed51b3 8808 if (off >= 0)
3dab1dad 8809 return NULL;
a0ed51b3
LW
8810 }
8811 else {
1de06328
YO
8812 while (off++ && s > lim) {
8813 s--;
8814 if (UTF8_IS_CONTINUED(*s)) {
8815 while (s > lim && UTF8_IS_CONTINUATION(*s))
8816 s--;
a0ed51b3 8817 }
1de06328 8818 /* XXX could check well-formedness here */
a0ed51b3
LW
8819 }
8820 if (off <= 0)
3dab1dad 8821 return NULL;
a0ed51b3
LW
8822 }
8823 return s;
8824}
51371543 8825
a75351a1
DM
8826
8827/* when executing a regex that may have (?{}), extra stuff needs setting
8828 up that will be visible to the called code, even before the current
8829 match has finished. In particular:
8830
8831 * $_ is localised to the SV currently being matched;
8832 * pos($_) is created if necessary, ready to be updated on each call-out
8833 to code;
8834 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
8835 isn't set until the current pattern is successfully finished), so that
8836 $1 etc of the match-so-far can be seen;
8837 * save the old values of subbeg etc of the current regex, and set then
8838 to the current string (again, this is normally only done at the end
8839 of execution)
a75351a1
DM
8840*/
8841
8842static void
8843S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
8844{
8845 MAGIC *mg;
8846 regexp *const rex = ReANY(reginfo->prog);
bf2039a9 8847 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8adc0f72 8848
8adc0f72 8849 eval_state->rex = rex;
a75351a1 8850
a75351a1
DM
8851 if (reginfo->sv) {
8852 /* Make $_ available to executed code. */
8853 if (reginfo->sv != DEFSV) {
8854 SAVE_DEFSV;
8855 DEFSV_set(reginfo->sv);
8856 }
8857
96c2a8ff 8858 if (!(mg = mg_find_mglob(reginfo->sv))) {
a75351a1 8859 /* prepare for quick setting of pos */
96c2a8ff 8860 mg = sv_magicext_mglob(reginfo->sv);
a75351a1
DM
8861 mg->mg_len = -1;
8862 }
8adc0f72
DM
8863 eval_state->pos_magic = mg;
8864 eval_state->pos = mg->mg_len;
25fdce4a 8865 eval_state->pos_flags = mg->mg_flags;
a75351a1 8866 }
8adc0f72
DM
8867 else
8868 eval_state->pos_magic = NULL;
8869
a75351a1 8870 if (!PL_reg_curpm) {
f65e70f5
DM
8871 /* PL_reg_curpm is a fake PMOP that we can attach the current
8872 * regex to and point PL_curpm at, so that $1 et al are visible
8873 * within a /(?{})/. It's just allocated once per interpreter the
8874 * first time its needed */
a75351a1
DM
8875 Newxz(PL_reg_curpm, 1, PMOP);
8876#ifdef USE_ITHREADS
8877 {
8878 SV* const repointer = &PL_sv_undef;
8879 /* this regexp is also owned by the new PL_reg_curpm, which
8880 will try to free it. */
8881 av_push(PL_regex_padav, repointer);
b9f2b683 8882 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
a75351a1
DM
8883 PL_regex_pad = AvARRAY(PL_regex_padav);
8884 }
8885#endif
8886 }
8887 SET_reg_curpm(reginfo->prog);
8adc0f72 8888 eval_state->curpm = PL_curpm;
a75351a1
DM
8889 PL_curpm = PL_reg_curpm;
8890 if (RXp_MATCH_COPIED(rex)) {
8891 /* Here is a serious problem: we cannot rewrite subbeg,
8892 since it may be needed if this match fails. Thus
8893 $` inside (?{}) could fail... */
8adc0f72
DM
8894 eval_state->subbeg = rex->subbeg;
8895 eval_state->sublen = rex->sublen;
8896 eval_state->suboffset = rex->suboffset;
a8ee055f 8897 eval_state->subcoffset = rex->subcoffset;
a75351a1 8898#ifdef PERL_ANY_COW
8adc0f72 8899 eval_state->saved_copy = rex->saved_copy;
a75351a1
DM
8900#endif
8901 RXp_MATCH_COPIED_off(rex);
8902 }
8903 else
8adc0f72 8904 eval_state->subbeg = NULL;
a75351a1
DM
8905 rex->subbeg = (char *)reginfo->strbeg;
8906 rex->suboffset = 0;
8907 rex->subcoffset = 0;
8908 rex->sublen = reginfo->strend - reginfo->strbeg;
8909}
8910
bf2039a9
DM
8911
8912/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
a75351a1 8913
51371543 8914static void
bf2039a9 8915S_cleanup_regmatch_info_aux(pTHX_ void *arg)
51371543 8916{
bf2039a9
DM
8917 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8918 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
331b2dcc 8919 regmatch_slab *s;
bf2039a9 8920
2ac8ff4b
DM
8921 Safefree(aux->poscache);
8922
331b2dcc 8923 if (eval_state) {
bf2039a9 8924
331b2dcc 8925 /* undo the effects of S_setup_eval_state() */
bf2039a9 8926
331b2dcc
DM
8927 if (eval_state->subbeg) {
8928 regexp * const rex = eval_state->rex;
8929 rex->subbeg = eval_state->subbeg;
8930 rex->sublen = eval_state->sublen;
8931 rex->suboffset = eval_state->suboffset;
8932 rex->subcoffset = eval_state->subcoffset;
db2c6cb3 8933#ifdef PERL_ANY_COW
331b2dcc 8934 rex->saved_copy = eval_state->saved_copy;
ed252734 8935#endif
331b2dcc
DM
8936 RXp_MATCH_COPIED_on(rex);
8937 }
8938 if (eval_state->pos_magic)
25fdce4a 8939 {
331b2dcc 8940 eval_state->pos_magic->mg_len = eval_state->pos;
25fdce4a
FC
8941 eval_state->pos_magic->mg_flags =
8942 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8943 | (eval_state->pos_flags & MGf_BYTES);
8944 }
331b2dcc
DM
8945
8946 PL_curpm = eval_state->curpm;
8adc0f72 8947 }
bf2039a9 8948
331b2dcc
DM
8949 PL_regmatch_state = aux->old_regmatch_state;
8950 PL_regmatch_slab = aux->old_regmatch_slab;
8951
8952 /* free all slabs above current one - this must be the last action
8953 * of this function, as aux and eval_state are allocated within
8954 * slabs and may be freed here */
8955
8956 s = PL_regmatch_slab->next;
8957 if (s) {
8958 PL_regmatch_slab->next = NULL;
8959 while (s) {
8960 regmatch_slab * const osl = s;
8961 s = s->next;
8962 Safefree(osl);
8963 }
8964 }
51371543 8965}
33b8afdf 8966
8adc0f72 8967
33b8afdf 8968STATIC void
5aaab254 8969S_to_utf8_substr(pTHX_ regexp *prog)
33b8afdf 8970{
7e0d5ad7
KW
8971 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8972 * on the converted value */
8973
a1cac82e 8974 int i = 1;
7918f24d
NC
8975
8976 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8977
a1cac82e
NC
8978 do {
8979 if (prog->substrs->data[i].substr
8980 && !prog->substrs->data[i].utf8_substr) {
8981 SV* const sv = newSVsv(prog->substrs->data[i].substr);
8982 prog->substrs->data[i].utf8_substr = sv;
8983 sv_utf8_upgrade(sv);
610460f9 8984 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 8985 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
8986 /* Trim the trailing \n that fbm_compile added last
8987 time. */
8988 SvCUR_set(sv, SvCUR(sv) - 1);
8989 /* Whilst this makes the SV technically "invalid" (as its
8990 buffer is no longer followed by "\0") when fbm_compile()
8991 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
8992 fbm_compile(sv, FBMcf_TAIL);
8993 } else
8994 fbm_compile(sv, 0);
610460f9 8995 }
a1cac82e
NC
8996 if (prog->substrs->data[i].substr == prog->check_substr)
8997 prog->check_utf8 = sv;
8998 }
8999 } while (i--);
33b8afdf
JH
9000}
9001
7e0d5ad7 9002STATIC bool
5aaab254 9003S_to_byte_substr(pTHX_ regexp *prog)
33b8afdf 9004{
7e0d5ad7
KW
9005 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9006 * on the converted value; returns FALSE if can't be converted. */
9007
a1cac82e 9008 int i = 1;
7918f24d
NC
9009
9010 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9011
a1cac82e
NC
9012 do {
9013 if (prog->substrs->data[i].utf8_substr
9014 && !prog->substrs->data[i].substr) {
9015 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
9016 if (! sv_utf8_downgrade(sv, TRUE)) {
9017 return FALSE;
9018 }
5400f398
KW
9019 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9020 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9021 /* Trim the trailing \n that fbm_compile added last
9022 time. */
9023 SvCUR_set(sv, SvCUR(sv) - 1);
9024 fbm_compile(sv, FBMcf_TAIL);
9025 } else
9026 fbm_compile(sv, 0);
9027 }
a1cac82e
NC
9028 prog->substrs->data[i].substr = sv;
9029 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9030 prog->check_substr = sv;
33b8afdf 9031 }
a1cac82e 9032 } while (i--);
7e0d5ad7
KW
9033
9034 return TRUE;
33b8afdf 9035}
66610fdd
RGS
9036
9037/*
14d04a33 9038 * ex: set ts=8 sts=4 sw=4 et:
37442d52 9039 */